129 changes: 129 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,129 @@
name: CI

on:
push:
branches:
- "main"
pull_request:

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

jobs:
lint-python:
name: Lint Python
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
cache: "pip"

- name: Run flake8
uses: py-actions/flake8@v2

validate-compute-block:
name: Validate Compute Block Config
runs-on: ubuntu-latest
needs: lint-python
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5

- name: Install dependencies
run: |
pip install -r requirements.txt

- name: Check cbc.yaml
run: |
python3 - <<'EOF'
import main  # imported for its side effect: registering the compute block entrypoints

from scystream.sdk.config import load_config, get_compute_block
from scystream.sdk.config.config_loader import _compare_configs
from pathlib import Path

CBC_PATH = Path("cbc.yaml")

if not CBC_PATH.exists():
raise FileNotFoundError("cbc.yaml not found in repo root.")

block_from_code = get_compute_block()
block_from_yaml = load_config(str(CBC_PATH))

_compare_configs(block_from_code, block_from_yaml)

print("cbc.yaml matches python code definition")
EOF

run-test:
name: Run Tests
runs-on: ubuntu-latest
needs: validate-compute-block
services:
minio:
# service containers cannot be given a start command, so use an image
# that starts the MinIO server by default instead of minio/minio
image: lazybit/minio
ports:
- 9000:9000
env:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
options: >-
--health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
--health-interval 5s
--health-retries 5
--health-timeout 5s
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
cache: "pip"

- name: Install dependencies
run: |
pip install -r requirements.txt

- name: Run Tests
run: pytest -vv

build:
name: Build Docker Image
runs-on: ubuntu-latest
needs: run-test
permissions:
contents: read
packages: write
steps:
- name: Checkout Repository
uses: actions/checkout@v4

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
tags: |
type=ref,event=pr
type=raw,value=latest,enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
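
To run the same drift check locally before pushing, the script from the "Check cbc.yaml" step can be saved as a standalone file. A minimal sketch using the exact SDK calls from the step above (the filename check_cbc.py is hypothetical):

# check_cbc.py — local mirror of the CI "Check cbc.yaml" step
import main  # imported for its side effect: registering the compute block entrypoints

from pathlib import Path

from scystream.sdk.config import load_config, get_compute_block
from scystream.sdk.config.config_loader import _compare_configs

CBC_PATH = Path("cbc.yaml")

if not CBC_PATH.exists():
    raise FileNotFoundError("cbc.yaml not found in repo root.")

block_from_code = get_compute_block()
block_from_yaml = load_config(str(CBC_PATH))

# _compare_configs is expected to raise on a mismatch, which is what fails the CI step
_compare_configs(block_from_code, block_from_yaml)

print("cbc.yaml matches python code definition")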

44 changes: 0 additions & 44 deletions .github/workflows/docker.yaml

This file was deleted.

16 changes: 9 additions & 7 deletions cbc.yaml
@@ -1,15 +1,16 @@
author: Paul Kalhorn
description: Language preprocessing for .txt or .bib files
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
entrypoints:
preprocess_bib_file:
description: Entrypoint for preprocessing a .bib file
envs:
BIB_DOWNLOAD_PATH: /tmp/input.bib
FILTER_STOPWORDS: true
LANGUAGE: en
NGRAM_MAX: 3
NGRAM_MIN: 2
-UNIGRAM_NORMALIZER: porter
+UNIGRAM_NORMALIZER: lemma
USE_NGRAMS: true
inputs:
bib_input:
@@ -23,7 +24,7 @@ entrypoints:
bib_file_S3_PORT: null
bib_file_S3_SECRET_KEY: null
bib_file_SELECTED_ATTRIBUTE: Abstract
description: The bib file, as well as one attribute selected for preprocessing
type: file
outputs:
dtm_output:
@@ -36,7 +37,7 @@ entrypoints:
dtm_output_S3_HOST: null
dtm_output_S3_PORT: null
dtm_output_S3_SECRET_KEY: null
description: Numpy representation of document-term matrix as .pkl file
type: file
vocab_output:
config:
@@ -57,7 +58,8 @@ entrypoints:
LANGUAGE: en
NGRAM_MAX: 3
NGRAM_MIN: 2
-UNIGRAM_NORMALIZER: porter
+TXT_DOWNLOAD_PATH: /tmp/input.txt
+UNIGRAM_NORMALIZER: lemma
USE_NGRAMS: true
inputs:
txt_input:
@@ -70,7 +72,7 @@ entrypoints:
txt_file_S3_HOST: null
txt_file_S3_PORT: null
txt_file_S3_SECRET_KEY: null
description: A .txt file
type: file
outputs:
dtm_output:
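
After editing cbc.yaml, the recorded defaults can be inspected without the SDK. A quick sketch, assuming PyYAML is available in the dev environment (the entrypoints/envs key names are taken from the file layout above; the filename inspect_cbc.py is hypothetical):

# inspect_cbc.py — print the default envs per entrypoint, e.g. to confirm
# UNIGRAM_NORMALIZER is now "lemma" and TXT_DOWNLOAD_PATH is set
import yaml

with open("cbc.yaml") as f:
    cbc = yaml.safe_load(f)

for name, entrypoint in cbc["entrypoints"].items():
    print(name, entrypoint["envs"])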
82 changes: 35 additions & 47 deletions main.py
@@ -1,18 +1,25 @@
import pickle
import tempfile
import logging

from scystream.sdk.core import entrypoint
from scystream.sdk.env.settings import (
EnvSettings,
InputSettings,
OutputSettings,
FileSettings
)
from scystream.sdk.file_handling.s3_manager import S3Operations

from preprocessing.core import Preprocessor
from preprocessing.loader import TxtLoader, BibLoader

logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class DTMFileOutput(FileSettings, OutputSettings):
__identifier__ = "dtm_output"
@@ -46,6 +53,8 @@ class PreprocessTXT(EnvSettings):
NGRAM_MIN: int = 2
NGRAM_MAX: int = 3

TXT_DOWNLOAD_PATH: str = "/tmp/input.txt"

txt_input: TXTFileInput
dtm_output: DTMFileOutput
vocab_output: VocabFileOutput
@@ -59,13 +68,17 @@ class PreprocessBIB(EnvSettings):
NGRAM_MIN: int = 2
NGRAM_MAX: int = 3

BIB_DOWNLOAD_PATH: str = "/tmp/input.bib"

bib_input: BIBFileInput
dtm_output: DTMFileOutput
vocab_output: VocabFileOutput


def _preprocess_and_store(texts, settings):
"""Shared preprocessing logic for TXT and BIB."""
logger.info(f"Starting preprocessing with {len(texts)} documents")

pre = Preprocessor(
language=settings.LANGUAGE,
filter_stopwords=settings.FILTER_STOPWORDS,
@@ -74,74 +87,49 @@ def _preprocess_and_store(texts, settings):
ngram_min=settings.NGRAM_MIN,
ngram_max=settings.NGRAM_MAX,
)

pre.texts = texts
pre.analyze_texts()

pre.generate_bag_of_words()

dtm, vocab = pre.generate_document_term_matrix()

with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:

pickle.dump(dtm, tmp_dtm)
tmp_dtm.flush()

pickle.dump(vocab, tmp_vocab)
tmp_vocab.flush()

logger.info("Uploading DTM to S3...")
S3Operations.upload(settings.dtm_output, tmp_dtm.name)

logger.info("Uploading vocabulary to S3...")
S3Operations.upload(settings.vocab_output, tmp_vocab.name)

logger.info("Preprocessing completed successfully.")


@entrypoint(PreprocessTXT)
def preprocess_txt_file(settings):
-S3Operations.download(settings.txt_input, "input.txt")
-texts = TxtLoader.load("./input.txt")
+logger.info("Downloading TXT input from S3...")
+S3Operations.download(settings.txt_input, settings.TXT_DOWNLOAD_PATH)
+texts = TxtLoader.load(settings.TXT_DOWNLOAD_PATH)

_preprocess_and_store(texts, settings)


@entrypoint(PreprocessBIB)
def preprocess_bib_file(settings):
-S3Operations.download(settings.bib_input, "input.bib")
+logger.info("Downloading BIB input from S3...")
+S3Operations.download(settings.bib_input, settings.BIB_DOWNLOAD_PATH)

texts = BibLoader.load(
"./input.bib",
settings.BIB_DOWNLOAD_PATH,
attribute=settings.bib_input.SELECTED_ATTRIBUTE,
)
_preprocess_and_store(texts, settings)


"""
if __name__ == "__main__":
test = PreprocessBIB(
bib_input=BIBFileInput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="input-bucket",
FILE_PATH="input_file_path",
FILE_NAME="wos_export",
SELECTED_ATTRIBUTE="abstract"
),
dtm_output=DTMFileOutput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="output-bucket",
FILE_PATH="output_file_path",
FILE_NAME="dtm_file_bib"
),
vocab_output=VocabFileOutput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="output-bucket",
FILE_PATH="output_file_path",
FILE_NAME="vocab_file_bib"
)
)

preprocess_bib_file(test)
"""
Empty file added preprocessing/__init__.py