Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# CI pipeline. Jobs run strictly in sequence via `needs`:
#   lint-python -> validate-compute-block -> run-test -> build
name: CI

on:
  push:
    branches:
      - "main"
  pull_request:

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}
  # Pinned (and quoted, so YAML keeps it a string) so every job uses the same
  # interpreter instead of whatever the runner image ships on PATH.
  # NOTE(review): confirm this matches the version the project targets.
  PYTHON_VERSION: "3.12"

jobs:
  lint-python:
    name: Lint Python
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: "pip"

      - name: Run flake8
        uses: py-actions/flake8@v2

  validate-compute-block:
    name: Validate Compute Block Config
    runs-on: ubuntu-latest
    needs: lint-python
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          # Same pip cache as the other jobs that install requirements.txt.
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      # Fails the job when cbc.yaml differs from the compute block defined in
      # code. The heredoc delimiter is quoted so the shell does not expand
      # anything inside the Python script.
      - name: Check cbcs
        run: |
          python3 - <<'EOF'
          # NOTE(review): imported for its side effects only - presumably this
          # registers the entrypoints that get_compute_block() reports.
          import main

          from scystream.sdk.config import load_config, get_compute_block
          # NOTE(review): _compare_configs is a private SDK helper and may
          # change without notice between SDK releases.
          from scystream.sdk.config.config_loader import _compare_configs
          from pathlib import Path

          CBC_PATH = Path("cbc.yaml")

          if not CBC_PATH.exists():
              raise FileNotFoundError("cbc.yaml not found in repo root.")

          block_from_code = get_compute_block()
          block_from_yaml = load_config(str(CBC_PATH))

          _compare_configs(block_from_code, block_from_yaml)

          print("cbc.yaml matches python code definition")
          EOF

  run-test:
    name: Run Tests
    runs-on: ubuntu-latest
    needs: validate-compute-block
    services:
      # S3-compatible object store for the tests, published on localhost:9000.
      minio:
        image: lazybit/minio
        ports:
          # Quoted so no YAML parser can read the mapping as a number.
          - "9000:9000"
        env:
          MINIO_ROOT_USER: minioadmin
          MINIO_ROOT_PASSWORD: minioadmin
        # The job's steps start only after this health check passes.
        options: >-
          --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
          --health-interval 5s
          --health-retries 5
          --health-timeout 5s
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: "pip"

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Run Tests
        run: pytest -vv

  build:
    name: Build Docker Image
    runs-on: ubuntu-latest
    needs: run-test
    permissions:
      contents: read
      packages: write
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4

      # REGISTRY is ghcr.io, so this authenticates against the GitHub
      # Container Registry using the workflow token.
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
          # Pull requests are tagged from the PR ref; `latest` is produced
          # only for builds of the main branch.
          tags: |
            type=ref,event=pr
            type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          # Pull requests from forks receive a read-only GITHUB_TOKEN, so a
          # push would be denied; those runs build the image without pushing.
          push: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

44 changes: 0 additions & 44 deletions .github/workflows/docker.yaml

This file was deleted.

112 changes: 112 additions & 0 deletions input.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@

@article{ WOS:001016714700004,
Author = {White, Joel},
Title = {Theoretical and Practical Paralogisms of Digital Immortality},
Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY},
Year = {2022},
Volume = {9},
Number = {2, SI},
Pages = {155-172},
Month = {JUL 3},
Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic
and popular relevance; specific naive metaphysical ideas, such as
immortality, have returned with this rise. This article refrains from
any ethical or political assessment of transhumanism. Still, it
critiques the exact metaphysical or idealistic nature of transhumanism
and its pursuit of digital immortality: the idea that, through
technological advancements, precisely in Artificial General
Intelligence, an immortal virtual ``self{''} will become possible. The
article follows the form of Immanuel Kant's ``Paralogisms{''} from the
Critique of Pure Reason, where Kant is concerned with the substantial,
immortal nature of the soul and its experiential impossibility. The
article will offer theoretical and practical paralogisms (false logical
inferences), arguing that the transhumanist claim that digital
immortality is possible fundamentally stems from two incorrect major
premises. The first concerns the substantial nature of information,
which informs the theoretical paralogisms; the second concerns infinite
transformation (pure plasticity), which informs the practical
paralogisms},
Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD},
Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND},
Type = {Article},
Language = {English},
DOI = {10.1080/20539320.2022.2150463},
ISSN = {2053-9320},
EISSN = {2053-9339},
Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms;
Digital Immortality},
Research-Areas = {Philosophy},
Web-of-Science-Categories = {Philosophy},
Author-Email = {jhmw01@gmail.com},
ORCID-Numbers = {White, Joel/0000-0001-6460-0564},
Number-of-Cited-References = {30},
Times-Cited = {0},
Usage-Count-Last-180-days = {3},
Usage-Count-Since-2013 = {15},
Journal-ISO = {J. Aesthet. Phenomenol.},
Doc-Delivery-Number = {K5GF0},
Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)},
Unique-ID = {WOS:001016714700004},
DA = {2025-06-26},
}

@article{ WOS:001322577100012,
Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir},
Title = {Sociopolitical Challenges to Digital Transformation of Rural
Communities: Learnings from a Case Study From Manipur, India},
Journal = {IT PROFESSIONAL},
Year = {2024},
Volume = {26},
Number = {4},
Pages = {42-47},
Month = {JUL-AUG},
Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized
the inclusive growth of digital networks and digital public goods,
utilizing a multistakeholder systems approach. Similarly, the
information and communications technology (ICT) Innovation and
Intervention Program of the Government of India's Digital North East
Vision 2022 has also emphasized a need for inclusive growth of ICT in
the Northeast Region. In line with the above, this article presents
insights from a field study conducted in the rural parts of Manipur,
India, which incidentally can be found to be applicable to many rural
parts of the developing world. The article envisions a community-driven
sociodigital transformation of the Northeast Region of India. In this
quest, the article highlights sociopolitical challenges for digital
transformation and provides insights for inclusive ICT in such
regions-infrastructure as a utility for every citizen, smart governance
and services on demand, digital empowerment of citizens, social welfare,
capacity building, and community engagement.},
Publisher = {IEEE COMPUTER SOC},
Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA},
Type = {Article},
Language = {English},
Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India.
Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India.
Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India.
Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.},
DOI = {10.1109/MITP.2024.3433459},
ISSN = {1520-9202},
EISSN = {1941-045X},
Keywords = {Technological innovation; Digital transformation; Government; Buildings;
Asia; Africa; Information and communication technology},
Research-Areas = {Computer Science; Telecommunications},
Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software
Engineering; Telecommunications},
Author-Email = {vkant@iitk.ac.in
sanjrampk@iiti.ac.in
sudhir.dixit@ieee.org},
Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of
Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT
System); Indian Institute of Technology (IIT) - Indore},
ResearcherID-Numbers = {/ITU-6308-2023},
ORCID-Numbers = {/0000-0002-6215-7500},
Number-of-Cited-References = {7},
Times-Cited = {0},
Usage-Count-Last-180-days = {11},
Usage-Count-Since-2013 = {22},
Journal-ISO = {IT Prof.},
Doc-Delivery-Number = {H3O9D},
Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)},
Unique-ID = {WOS:001322577100012},
DA = {2025-06-26},
}
File renamed without changes.
47 changes: 5 additions & 42 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

from scystream.sdk.core import entrypoint
from scystream.sdk.env.settings import (
EnvSettings,
InputSettings,
OutputSettings,
FileSettings
EnvSettings,
InputSettings,
OutputSettings,
FileSettings
)
from scystream.sdk.file_handling.s3_manager import S3Operations

Expand Down Expand Up @@ -81,7 +81,7 @@ def _preprocess_and_store(texts, settings):
dtm, vocab = pre.generate_document_term_matrix()

with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:

pickle.dump(dtm, tmp_dtm)
tmp_dtm.flush()
Expand All @@ -108,40 +108,3 @@ def preprocess_bib_file(settings):
attribute=settings.bib_input.SELECTED_ATTRIBUTE,
)
_preprocess_and_store(texts, settings)


"""
if __name__ == "__main__":
test = PreprocessBIB(
bib_input=BIBFileInput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="input-bucket",
FILE_PATH="input_file_path",
FILE_NAME="wos_export",
SELECTED_ATTRIBUTE="abstract"
),
dtm_output=DTMFileOutput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="output-bucket",
FILE_PATH="output_file_path",
FILE_NAME="dtm_file_bib"
),
vocab_output=VocabFileOutput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="output-bucket",
FILE_PATH="output_file_path",
FILE_NAME="vocab_file_bib"
)
)

preprocess_bib_file(test)
"""
Empty file added preprocessing/__init__.py
Empty file.
14 changes: 7 additions & 7 deletions preprocessing/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@
def normalize_text(text: str) -> str:
    """Strip LaTeX/BibTeX markup from ``text`` and collapse its whitespace.

    The passes run in a fixed order because each one relies on what the
    previous ones left behind:

    1. ``\\cmd{arg}`` is replaced by ``arg`` so the argument text survives.
    2. Any remaining bare ``\\cmd`` is dropped.
    3. Leftover ``{`` and ``}`` are dropped. This must come after step 1,
       which needs the braces to find a command's argument.
    4. An escaped double quote before a letter (``\\"o``) is reduced to the
       letter.
    5. An escaped apostrophe (``\\'``) is dropped. Plain apostrophes are
       kept, so ``Kant's`` stays intact.
    6. Every whitespace run becomes a single space.

    Args:
        text: Raw field value. Any falsy value (``""``, ``None``) is accepted.

    Returns:
        The cleaned string without leading or trailing whitespace, or ``""``
        when ``text`` is falsy.
    """
    if not text:
        return ""

    # Unwrap commands that carry a braced argument, keeping the argument.
    text = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", text)

    # Drop commands that have no braced argument left.
    text = re.sub(r"\\[a-zA-Z]+", "", text)

    # Remove grouping braces only now that step 1 no longer needs them.
    text = re.sub(r"[{}]", "", text)

    # \"o -> o (escaped double quote followed by a letter).
    text = re.sub(r'\\"([a-zA-Z])', r'\1', text)

    # \'e -> e; only the escaped apostrophe is removed, never a plain one.
    text = re.sub(r"\\'", "", text)

    text = re.sub(r"\s+", " ", text)

    return text.strip()
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ spacy==3.8.7
nltk==3.9.1
numpy==2.3.3
bibtexparser==1.4.3
pytest==9.0.1
Empty file added test/__init__.py
Empty file.
Loading
Loading