129 changes: 129 additions & 0 deletions .github/workflows/ci.yaml
@@ -0,0 +1,129 @@
name: CI

on:
push:
branches:
- "main"
pull_request:

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

jobs:
lint-python:
name: Lint Python
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
cache: "pip"

- name: Run flake8
uses: py-actions/flake8@v2

validate-compute-block:
name: Validate Compute Block Config
runs-on: ubuntu-latest
needs: lint-python
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5

- name: Install dependencies
run: |
pip install -r requirements.txt

- name: Check cbc.yaml
run: |
python3 - <<'EOF'
import main  # imported for its side effect: registering the compute block entrypoints

from scystream.sdk.config import load_config, get_compute_block
from scystream.sdk.config.config_loader import _compare_configs
from pathlib import Path

CBC_PATH = Path("cbc.yaml")

if not CBC_PATH.exists():
raise FileNotFoundError("cbc.yaml not found in repo root.")

block_from_code = get_compute_block()
block_from_yaml = load_config(str(CBC_PATH))

_compare_configs(block_from_code, block_from_yaml)

print("cbc.yaml matches python code definition")
EOF

run-test:
name: Run Tests
runs-on: ubuntu-latest
needs: validate-compute-block
services:
minio:
# service containers cannot be given a start command, so use an image
# that starts the MinIO server by default instead of minio/minio
image: lazybit/minio
ports:
- 9000:9000
env:
MINIO_ROOT_USER: minioadmin
MINIO_ROOT_PASSWORD: minioadmin
options: >-
--health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
--health-interval 5s
--health-retries 5
--health-timeout 5s
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
cache: "pip"

- name: Install dependencies
run: |
pip install -r requirements.txt

- name: Run Tests
run: pytest -vv

build:
name: Build Docker Image
runs-on: ubuntu-latest
needs: run-test
permissions:
contents: read
packages: write
steps:
- name: Checkout Repository
uses: actions/checkout@v4

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
tags: |
type=ref,event=pr
type=raw,value=latest,enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
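
To run the same drift check locally before pushing, the script from the "Check cbc.yaml" step can be saved as a standalone file. A minimal sketch using the exact SDK calls from the step above (the filename check_cbc.py is hypothetical):

# check_cbc.py — local mirror of the CI "Check cbc.yaml" step
import main  # imported for its side effect: registering the compute block entrypoints

from pathlib import Path

from scystream.sdk.config import load_config, get_compute_block
from scystream.sdk.config.config_loader import _compare_configs

CBC_PATH = Path("cbc.yaml")

if not CBC_PATH.exists():
    raise FileNotFoundError("cbc.yaml not found in repo root.")

block_from_code = get_compute_block()
block_from_yaml = load_config(str(CBC_PATH))

# _compare_configs is expected to raise on a mismatch, which is what fails the CI step
_compare_configs(block_from_code, block_from_yaml)

print("cbc.yaml matches python code definition")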

44 changes: 0 additions & 44 deletions .github/workflows/docker.yaml

This file was deleted.

16 changes: 9 additions & 7 deletions cbc.yaml
@@ -1,15 +1,16 @@
author: Paul Kalhorn
description: Language preprocessing for .txt or .bib files
docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing
entrypoints:
preprocess_bib_file:
description: Entrypoint for preprocessing a .bib file
envs:
BIB_DOWNLOAD_PATH: /tmp/input.bib
FILTER_STOPWORDS: true
LANGUAGE: en
NGRAM_MAX: 3
NGRAM_MIN: 2
-UNIGRAM_NORMALIZER: porter
+UNIGRAM_NORMALIZER: lemma
USE_NGRAMS: true
inputs:
bib_input:
@@ -23,7 +24,7 @@ entrypoints:
bib_file_S3_PORT: null
bib_file_S3_SECRET_KEY: null
bib_file_SELECTED_ATTRIBUTE: Abstract
description: The bib file, as well as one attribute selected for preprocessing
type: file
outputs:
dtm_output:
@@ -36,7 +37,7 @@ entrypoints:
dtm_output_S3_HOST: null
dtm_output_S3_PORT: null
dtm_output_S3_SECRET_KEY: null
description: Numpy representation of document-term matrix as .pkl file
type: file
vocab_output:
config:
@@ -57,7 +58,8 @@ entrypoints:
LANGUAGE: en
NGRAM_MAX: 3
NGRAM_MIN: 2
-UNIGRAM_NORMALIZER: porter
+TXT_DOWNLOAD_PATH: /tmp/input.txt
+UNIGRAM_NORMALIZER: lemma
USE_NGRAMS: true
inputs:
txt_input:
@@ -70,7 +72,7 @@ entrypoints:
txt_file_S3_HOST: null
txt_file_S3_PORT: null
txt_file_S3_SECRET_KEY: null
description: A .txt file
type: file
outputs:
dtm_output:
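
After editing cbc.yaml, the recorded defaults can be inspected without the SDK. A quick sketch, assuming PyYAML is available in the dev environment (the entrypoints/envs key names are taken from the file layout above; the filename inspect_cbc.py is hypothetical):

# inspect_cbc.py — print the default envs per entrypoint, e.g. to confirm
# UNIGRAM_NORMALIZER is now "lemma" and TXT_DOWNLOAD_PATH is set
import yaml

with open("cbc.yaml") as f:
    cbc = yaml.safe_load(f)

for name, entrypoint in cbc["entrypoints"].items():
    print(name, entrypoint["envs"])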
82 changes: 35 additions & 47 deletions main.py
@@ -1,18 +1,25 @@
import pickle
import tempfile
import logging

from scystream.sdk.core import entrypoint
from scystream.sdk.env.settings import (
EnvSettings,
InputSettings,
OutputSettings,
FileSettings
)
from scystream.sdk.file_handling.s3_manager import S3Operations

from preprocessing.core import Preprocessor
from preprocessing.loader import TxtLoader, BibLoader

logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class DTMFileOutput(FileSettings, OutputSettings):
__identifier__ = "dtm_output"
@@ -46,6 +53,8 @@ class PreprocessTXT(EnvSettings):
NGRAM_MIN: int = 2
NGRAM_MAX: int = 3

TXT_DOWNLOAD_PATH: str = "/tmp/input.txt"

txt_input: TXTFileInput
dtm_output: DTMFileOutput
vocab_output: VocabFileOutput
@@ -59,13 +68,17 @@ class PreprocessBIB(EnvSettings):
NGRAM_MIN: int = 2
NGRAM_MAX: int = 3

BIB_DOWNLOAD_PATH: str = "/tmp/input.bib"

bib_input: BIBFileInput
dtm_output: DTMFileOutput
vocab_output: VocabFileOutput


def _preprocess_and_store(texts, settings):
"""Shared preprocessing logic for TXT and BIB."""
logger.info(f"Starting preprocessing with {len(texts)} documents")

pre = Preprocessor(
language=settings.LANGUAGE,
filter_stopwords=settings.FILTER_STOPWORDS,
@@ -74,74 +87,49 @@ def _preprocess_and_store(texts, settings):
ngram_min=settings.NGRAM_MIN,
ngram_max=settings.NGRAM_MAX,
)

pre.texts = texts
pre.analyze_texts()

pre.generate_bag_of_words()

dtm, vocab = pre.generate_document_term_matrix()

with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:

pickle.dump(dtm, tmp_dtm)
tmp_dtm.flush()

pickle.dump(vocab, tmp_vocab)
tmp_vocab.flush()

logger.info("Uploading DTM to S3...")
S3Operations.upload(settings.dtm_output, tmp_dtm.name)

logger.info("Uploading vocabulary to S3...")
S3Operations.upload(settings.vocab_output, tmp_vocab.name)

logger.info("Preprocessing completed successfully.")


@entrypoint(PreprocessTXT)
def preprocess_txt_file(settings):
-S3Operations.download(settings.txt_input, "input.txt")
-texts = TxtLoader.load("./input.txt")
+logger.info("Downloading TXT input from S3...")
+S3Operations.download(settings.txt_input, settings.TXT_DOWNLOAD_PATH)
+texts = TxtLoader.load(settings.TXT_DOWNLOAD_PATH)

_preprocess_and_store(texts, settings)


@entrypoint(PreprocessBIB)
def preprocess_bib_file(settings):
-S3Operations.download(settings.bib_input, "input.bib")
+logger.info("Downloading BIB input from S3...")
+S3Operations.download(settings.bib_input, settings.BIB_DOWNLOAD_PATH)

texts = BibLoader.load(
"./input.bib",
settings.BIB_DOWNLOAD_PATH,
attribute=settings.bib_input.SELECTED_ATTRIBUTE,
)
_preprocess_and_store(texts, settings)


"""
if __name__ == "__main__":
test = PreprocessBIB(
bib_input=BIBFileInput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="input-bucket",
FILE_PATH="input_file_path",
FILE_NAME="wos_export",
SELECTED_ATTRIBUTE="abstract"
),
dtm_output=DTMFileOutput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="output-bucket",
FILE_PATH="output_file_path",
FILE_NAME="dtm_file_bib"
),
vocab_output=VocabFileOutput(
S3_HOST="http://localhost",
S3_PORT="9000",
S3_ACCESS_KEY="minioadmin",
S3_SECRET_KEY="minioadmin",
BUCKET_NAME="output-bucket",
FILE_PATH="output_file_path",
FILE_NAME="vocab_file_bib"
)
)

preprocess_bib_file(test)
"""
Empty file added preprocessing/__init__.py