diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..786a9ed
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,129 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - "main"
+  pull_request:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  lint-python:
+    name: Lint Python
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          cache: "pip"
+
+      - name: Run flake8
+        uses: py-actions/flake8@v2
+
+  validate-compute-block:
+    name: Validate Compute Block Config
+    runs-on: ubuntu-latest
+    needs: lint-python
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Check cbcs
+        run: |
+          python3 - <<'EOF'
+          import main
+
+          from scystream.sdk.config import load_config, get_compute_block
+          from scystream.sdk.config.config_loader import _compare_configs
+          from pathlib import Path
+
+          CBC_PATH = Path("cbc.yaml")
+
+          if not CBC_PATH.exists():
+              raise FileNotFoundError("cbc.yaml not found in repo root.")
+
+          block_from_code = get_compute_block()
+          block_from_yaml = load_config(str(CBC_PATH))
+
+          _compare_configs(block_from_code, block_from_yaml)
+
+          print("cbc.yaml matches python code definition")
+          EOF
+
+  run-test:
+    name: Run Tests
+    runs-on: ubuntu-latest
+    needs: validate-compute-block
+    services:
+      minio:
+        image: lazybit/minio
+        ports:
+          - 9000:9000
+        env:
+          MINIO_ROOT_USER: minioadmin
+          MINIO_ROOT_PASSWORD: minioadmin
+        options: >-
+          --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
+          --health-interval 5s
+          --health-retries 5
+          --health-timeout 5s
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Run Tests
+        run: pytest -vv
+
+  build:
+    name: Build Docker Image
+    runs-on: ubuntu-latest
+    needs: run-test
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
+          tags: |
+            type=ref, event=pr
+            type=raw, value=latest, enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
deleted file mode 100644
index df0d4cf..0000000
--- a/.github/workflows/docker.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Docker
-on:
-  push:
-    branches:
-      - "main"
-  pull_request:
-
-env:
-  REGISTRY: ghcr.io
-  IMAGE_NAME: ${{ github.repository }}
-
-jobs:
-  build:
-    name: Build docker image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata for docker
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
-          tags: |
-            type=ref, event=pr
-            type=raw, value=latest, enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}
-
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v5
-        with:
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
diff --git a/input.bib b/input.bib
new file mode 100644
index 0000000..f525305
--- /dev/null
+++ b/input.bib
@@ -0,0 +1,112 @@
+
+@article{ WOS:001016714700004,
+Author = {White, Joel},
+Title = {Theoretical and Practical Paralogisms of Digital Immortality},
+Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY},
+Year = {2022},
+Volume = {9},
+Number = {2, SI},
+Pages = {155-172},
+Month = {JUL 3},
+Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic
+   and popular relevance; specific naive metaphysical ideas, such as
+   immortality, have returned with this rise. This article refrains from
+   any ethical or political assessment of transhumanism. Still, it
+   critiques the exact metaphysical or idealistic nature of transhumanism
+   and its pursuit of digital immortality: the idea that, through
+   technological advancements, precisely in Artificial General
+   Intelligence, an immortal virtual ``self{''} will become possible. The
+   article follows the form of Immanuel Kant's ``Paralogisms{''} from the
+   Critique of Pure Reason, where Kant is concerned with the substantial,
+   immortal nature of the soul and its experiential impossibility. The
+   article will offer theoretical and practical paralogisms (false logical
+   inferences), arguing that the transhumanist claim that digital
+   immortality is possible fundamentally stems from two incorrect major
+   premises. The first concerns the substantial nature of information,
+   which informs the theoretical paralogisms; the second concerns infinite
+   transformation (pure plasticity), which informs the practical
+   paralogisms},
+Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD},
+Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND},
+Type = {Article},
+Language = {English},
+DOI = {10.1080/20539320.2022.2150463},
+ISSN = {2053-9320},
+EISSN = {2053-9339},
+Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms;
+   Digital Immortality},
+Research-Areas = {Philosophy},
+Web-of-Science-Categories = {Philosophy},
+Author-Email = {jhmw01@gmail.com},
+ORCID-Numbers = {White, Joel/0000-0001-6460-0564},
+Number-of-Cited-References = {30},
+Times-Cited = {0},
+Usage-Count-Last-180-days = {3},
+Usage-Count-Since-2013 = {15},
+Journal-ISO = {J. Aesthet. Phenomenol.},
+Doc-Delivery-Number = {K5GF0},
+Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)},
+Unique-ID = {WOS:001016714700004},
+DA = {2025-06-26},
+}
+
+@article{ WOS:001322577100012,
+Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir},
+Title = {Sociopolitical Challenges to Digital Transformation of Rural
+   Communities: Learnings from a Case Study From Manipur, India},
+Journal = {IT PROFESSIONAL},
+Year = {2024},
+Volume = {26},
+Number = {4},
+Pages = {42-47},
+Month = {JUL-AUG},
+Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized
+   the inclusive growth of digital networks and digital public goods,
+   utilizing a multistakeholder systems approach. Similarly, the
+   information and communications technology (ICT) Innovation and
+   Intervention Program of the Government of India's Digital North East
+   Vision 2022 has also emphasized a need for inclusive growth of ICT in
+   the Northeast Region. In line with the above, this article presents
+   insights from a field study conducted in the rural parts of Manipur,
+   India, which incidentally can be found to be applicable to many rural
+   parts of the developing world. The article envisions a community-driven
+   sociodigital transformation of the Northeast Region of India. In this
+   quest, the article highlights sociopolitical challenges for digital
+   transformation and provides insights for inclusive ICT in such
+   regions-infrastructure as a utility for every citizen, smart governance
+   and services on demand, digital empowerment of citizens, social welfare,
+   capacity building, and community engagement.},
+Publisher = {IEEE COMPUTER SOC},
+Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA},
+Type = {Article},
+Language = {English},
+Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India.
+   Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India.
+   Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India.
+   Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.},
+DOI = {10.1109/MITP.2024.3433459},
+ISSN = {1520-9202},
+EISSN = {1941-045X},
+Keywords = {Technological innovation; Digital transformation; Government; Buildings;
+   Asia; Africa; Information and communication technology},
+Research-Areas = {Computer Science; Telecommunications},
+Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software
+   Engineering; Telecommunications},
+Author-Email = {vkant@iitk.ac.in
+   sanjrampk@iiti.ac.in
+   sudhir.dixit@ieee.org},
+Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of
+   Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT
+   System); Indian Institute of Technology (IIT) - Indore},
+ResearcherID-Numbers = {/ITU-6308-2023},
+ORCID-Numbers = {/0000-0002-6215-7500},
+Number-of-Cited-References = {7},
+Times-Cited = {0},
+Usage-Count-Last-180-days = {11},
+Usage-Count-Since-2013 = {22},
+Journal-ISO = {IT Prof.},
+Doc-Delivery-Number = {H3O9D},
+Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)},
+Unique-ID = {WOS:001322577100012},
+DA = {2025-06-26},
+}
diff --git a/test/files/test.txt b/input.txt
similarity index 100%
rename from test/files/test.txt
rename to input.txt
diff --git a/main.py b/main.py
index d48e02e..6698911 100644
--- a/main.py
+++ b/main.py
@@ -3,10 +3,10 @@
 
 from scystream.sdk.core import entrypoint
 from scystream.sdk.env.settings import (
-        EnvSettings,
-        InputSettings,
-        OutputSettings,
-        FileSettings
+    EnvSettings,
+    InputSettings,
+    OutputSettings,
+    FileSettings
 )
 from scystream.sdk.file_handling.s3_manager import S3Operations
 
@@ -81,7 +81,7 @@ def _preprocess_and_store(texts, settings):
     dtm, vocab = pre.generate_document_term_matrix()
 
     with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
-        tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
+            tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
 
         pickle.dump(dtm, tmp_dtm)
         tmp_dtm.flush()
@@ -108,40 +108,3 @@ def preprocess_bib_file(settings):
         attribute=settings.bib_input.SELECTED_ATTRIBUTE,
     )
     _preprocess_and_store(texts, settings)
-
-
-"""
-if __name__ == "__main__":
-    test = PreprocessBIB(
-        bib_input=BIBFileInput(
-            S3_HOST="http://localhost",
-            S3_PORT="9000",
-            S3_ACCESS_KEY="minioadmin",
-            S3_SECRET_KEY="minioadmin",
-            BUCKET_NAME="input-bucket",
-            FILE_PATH="input_file_path",
-            FILE_NAME="wos_export",
-            SELECTED_ATTRIBUTE="abstract"
-        ),
-        dtm_output=DTMFileOutput(
-            S3_HOST="http://localhost",
-            S3_PORT="9000",
-            S3_ACCESS_KEY="minioadmin",
-            S3_SECRET_KEY="minioadmin",
-            BUCKET_NAME="output-bucket",
-            FILE_PATH="output_file_path",
-            FILE_NAME="dtm_file_bib"
-        ),
-        vocab_output=VocabFileOutput(
-            S3_HOST="http://localhost",
-            S3_PORT="9000",
-            S3_ACCESS_KEY="minioadmin",
-            S3_SECRET_KEY="minioadmin",
-            BUCKET_NAME="output-bucket",
-            FILE_PATH="output_file_path",
-            FILE_NAME="vocab_file_bib"
-        )
-    )
-
-    preprocess_bib_file(test)
-"""
diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/preprocessing/loader.py b/preprocessing/loader.py
index d55aac3..9ff51c6 100644
--- a/preprocessing/loader.py
+++ b/preprocessing/loader.py
@@ -5,17 +5,17 @@ def normalize_text(text: str) -> str:
     if not text:
         return ""
 
-    # Remove curly braces
-    text = re.sub(r"[{}]", "", text)
 
-    # Remove LaTeX commands
-    text = re.sub(r"\\[a-zA-Z]+\s*(\{[^}]*\})?", "", text)
+    text = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", text)
+
+    text = re.sub(r"\\[a-zA-Z]+", "", text)
 
-    # Remove LaTeX escaped quotes/accents
-    text = re.sub(r"\\""[a-zA-Z]", lambda m: m.group(0)[-1], text)
+    text = re.sub(r"[{}]", "", text)
+
+    text = re.sub(r'\\"([a-zA-Z])', r'\1', text)
     text = re.sub(r"\\'", "", text)
 
-    text = text.replace("'", "")
+    text = re.sub(r"\s+", " ", text)
 
     return text.strip()
 
diff --git a/requirements.txt b/requirements.txt
index 3493ec1..e7737db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ spacy==3.8.7
 nltk==3.9.1
 numpy==2.3.3
 bibtexparser==1.4.3
+pytest==9.0.1
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..5ce5cca
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,20 @@
+import pytest
+
+from preprocessing.core import Preprocessor
+
+
+@pytest.fixture
+def simple_texts():
+    return ["This is a test sentence.", "Another test sentence."]
+
+
+@pytest.fixture
+def preprocessor():
+    return Preprocessor(
+        language="en",
+        filter_stopwords=True,
+        unigram_normalizer="porter",
+        use_ngrams=True,
+        ngram_min=2,
+        ngram_max=3,
+    )
diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl
new file mode 100644
index 0000000..8127603
Binary files /dev/null and b/test/files/expected_dtm_from_bib.pkl differ
diff --git a/test/files/expected_dtm_from_txt.pkl b/test/files/expected_dtm_from_txt.pkl
new file mode 100644
index 0000000..6ca1a74
Binary files /dev/null and b/test/files/expected_dtm_from_txt.pkl differ
diff --git a/test/files/expected_vocab_from_bib.pkl b/test/files/expected_vocab_from_bib.pkl
new file mode 100644
index 0000000..0641a70
Binary files /dev/null and b/test/files/expected_vocab_from_bib.pkl differ
diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl
new file mode 100644
index 0000000..0698bc3
Binary files /dev/null and b/test/files/expected_vocab_from_txt.pkl differ
diff --git a/test/files/input.txt b/test/files/input.txt
new file mode 100644
index 0000000..1755b0f
--- /dev/null
+++ b/test/files/input.txt
@@ -0,0 +1,4 @@
+Cats chase mice. Dogs chase cats.
+Birds fly high. Cats and dogs coexist.
+Mice hide from cats. Birds sing loudly.
+Cats and dogs coexist. Cats and dogs coexist.
diff --git a/test/test_full.py b/test/test_full.py
new file mode 100644
index 0000000..c12be20
--- /dev/null
+++ b/test/test_full.py
@@ -0,0 +1,213 @@
+import os
+import boto3
+import pytest
+import pickle
+import numpy as np
+
+from pathlib import Path
+from main import preprocess_bib_file, preprocess_txt_file
+from botocore.exceptions import ClientError
+
+MINIO_USER = "minioadmin"
+MINIO_PWD = "minioadmin"
+BUCKET_NAME = "testbucket"
+
+
+def ensure_bucket(s3, bucket):
+    try:
+        s3.head_bucket(Bucket=bucket)
+    except ClientError as e:
+        error_code = e.response["Error"]["Code"]
+        if error_code in ("404", "NoSuchBucket"):
+            s3.create_bucket(Bucket=bucket)
+        else:
+            raise
+
+
+def download_to_tmp(s3, bucket, key):
+    tmp_path = Path("/tmp") / key.replace("/", "_")
+    s3.download_file(bucket, key, str(tmp_path))
+    return tmp_path
+
+
+@pytest.fixture
+def s3_minio():
+    client = boto3.client(
+        "s3",
+        endpoint_url="http://localhost:9000",
+        aws_access_key_id=MINIO_USER,
+        aws_secret_access_key=MINIO_PWD
+    )
+    ensure_bucket(client, BUCKET_NAME)
+    return client
+
+
+def test_full_bib(s3_minio):
+    input_file_name = "input"
+    dtm_output_file_name = "dtm_file"
+    vocab_output_file_name = "vocab_file"
+
+    bib_path = Path(__file__).parent / "files" / f"{input_file_name}.bib"
+    bib_bytes = bib_path.read_bytes()
+
+    s3_minio.put_object(
+        Bucket=BUCKET_NAME,
+        Key=f"{input_file_name}.bib",
+        Body=bib_bytes
+    )
+
+    env = {
+        "bib_file_S3_HOST": "http://127.0.0.1",
+        "bib_file_S3_PORT": "9000",
+        "bib_file_S3_ACCESS_KEY": MINIO_USER,
+        "bib_file_S3_SECRET_KEY": MINIO_PWD,
+        "bib_file_BUCKET_NAME": BUCKET_NAME,
+        "bib_file_FILE_PATH": "",
+        "bib_file_FILE_NAME": input_file_name,
+        "bib_file_SELECTED_ATTRIBUTE": "abstract",
+
+        "dtm_output_S3_HOST": "http://127.0.0.1",
+        "dtm_output_S3_PORT": "9000",
+        "dtm_output_S3_ACCESS_KEY": MINIO_USER,
+        "dtm_output_S3_SECRET_KEY": MINIO_PWD,
+        "dtm_output_BUCKET_NAME": BUCKET_NAME,
+        "dtm_output_FILE_PATH": "",
+        "dtm_output_FILE_NAME": dtm_output_file_name,
+
+        "vocab_output_S3_HOST": "http://127.0.0.1",
+        "vocab_output_S3_PORT": "9000",
+        "vocab_output_S3_ACCESS_KEY": MINIO_USER,
+        "vocab_output_S3_SECRET_KEY": MINIO_PWD,
+        "vocab_output_BUCKET_NAME": BUCKET_NAME,
+        "vocab_output_FILE_PATH": "",
+        "vocab_output_FILE_NAME": vocab_output_file_name,
+    }
+
+    for k, v in env.items():
+        os.environ[k] = v
+
+    preprocess_bib_file()
+
+    keys = [
+        o["Key"]
+        for o in s3_minio.list_objects_v2(
+            Bucket=BUCKET_NAME).get("Contents", [])
+    ]
+
+    assert f"{dtm_output_file_name}.pkl" in keys
+    assert f"{vocab_output_file_name}.pkl" in keys
+
+    dtm_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{dtm_output_file_name}.pkl")
+    vocab_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{vocab_output_file_name}.pkl")
+
+    # Load produced results
+    with open(dtm_path, "rb") as f:
+        dtm = pickle.load(f)
+
+    with open(vocab_path, "rb") as f:
+        vocab = pickle.load(f)
+
+    # Load expected snapshot files
+    expected_vocab_path = Path(__file__).parent / \
+        "files" / "expected_vocab_from_bib.pkl"
+    expected_dtm_path = Path(__file__).parent / "files" / \
+        "expected_dtm_from_bib.pkl"
+
+    with open(expected_vocab_path, "rb") as f:
+        expected_vocab = pickle.load(f)
+
+    with open(expected_dtm_path, "rb") as f:
+        expected_dtm = pickle.load(f)
+
+    assert vocab == expected_vocab
+    np.testing.assert_array_equal(dtm, expected_dtm)
+
+
+def test_full_txt(s3_minio):
+    input_file_name = "input"
+    dtm_output_file_name = "dtm_txt_file"
+    vocab_output_file_name = "vocab_txt_file"
+
+    txt_path = Path(__file__).parent / "files" / f"{input_file_name}.txt"
+    txt_bytes = txt_path.read_bytes()
+
+    s3_minio.put_object(
+        Bucket=BUCKET_NAME,
+        Key=f"{input_file_name}.txt",
+        Body=txt_bytes
+    )
+
+    env = {
+        "txt_file_S3_HOST": "http://127.0.0.1",
+        "txt_file_S3_PORT": "9000",
+        "txt_file_S3_ACCESS_KEY": MINIO_USER,
+        "txt_file_S3_SECRET_KEY": MINIO_PWD,
+        "txt_file_BUCKET_NAME": BUCKET_NAME,
+        "txt_file_FILE_PATH": "",
+        "txt_file_FILE_NAME": input_file_name,
+
+        "dtm_output_S3_HOST": "http://127.0.0.1",
+        "dtm_output_S3_PORT": "9000",
+        "dtm_output_S3_ACCESS_KEY": MINIO_USER,
+        "dtm_output_S3_SECRET_KEY": MINIO_PWD,
+        "dtm_output_BUCKET_NAME": BUCKET_NAME,
+        "dtm_output_FILE_PATH": "",
+        "dtm_output_FILE_NAME": dtm_output_file_name,
+
+        "vocab_output_S3_HOST": "http://127.0.0.1",
+        "vocab_output_S3_PORT": "9000",
+        "vocab_output_S3_ACCESS_KEY": MINIO_USER,
+        "vocab_output_S3_SECRET_KEY": MINIO_PWD,
+        "vocab_output_BUCKET_NAME": BUCKET_NAME,
+        "vocab_output_FILE_PATH": "",
+        "vocab_output_FILE_NAME": vocab_output_file_name,
+    }
+
+    for k, v in env.items():
+        os.environ[k] = v
+
+    preprocess_txt_file()
+
+    keys = [
+        o["Key"]
+        for o in s3_minio.list_objects_v2(
+            Bucket=BUCKET_NAME).get("Contents", [])
+    ]
+
+    assert f"{dtm_output_file_name}.pkl" in keys
+    assert f"{vocab_output_file_name}.pkl" in keys
+
+    # Download produced files
+    dtm_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{dtm_output_file_name}.pkl")
+    vocab_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{vocab_output_file_name}.pkl")
+
+    # Load produced results
+    with open(dtm_path, "rb") as f:
+        dtm = pickle.load(f)
+
+    with open(vocab_path, "rb") as f:
+        vocab = pickle.load(f)
+
+    # Load expected snapshot files
+    expected_vocab_path = Path(__file__).parent / \
+        "files" / "expected_vocab_from_txt.pkl"
+    expected_dtm_path = Path(__file__).parent / \
+        "files" / "expected_dtm_from_txt.pkl"
+
+    with open(expected_vocab_path, "rb") as f:
+        expected_vocab = pickle.load(f)
+
+    with open(expected_dtm_path, "rb") as f:
+        expected_dtm = pickle.load(f)
+
+    # Assertions
+    assert vocab == expected_vocab
+
+    if hasattr(dtm, "toarray"):
+        np.testing.assert_array_equal(dtm.toarray(), expected_dtm.toarray())
+    else:
+        np.testing.assert_array_equal(dtm, expected_dtm)
diff --git a/test/test_loaders.py b/test/test_loaders.py
new file mode 100644
index 0000000..3c96468
--- /dev/null
+++ b/test/test_loaders.py
@@ -0,0 +1,33 @@
+import os
+import tempfile
+
+from preprocessing.loader import TxtLoader, BibLoader
+
+
+def test_txt_loader_reads_and_normalizes():
+    with tempfile.NamedTemporaryFile("w+", delete=False) as f:
+        f.write("Hello {World}\nSecond line")
+        fname = f.name
+
+    result = TxtLoader.load(fname)
+    os.unlink(fname)
+
+    assert result == ["Hello World", "Second line"]
+
+
+def test_bib_loader_extracts_attribute():
+    bib_content = r"""
+    @article{a,
+      abstract = {This is {Bib} \textbf{text}.},
+      title = {Ignore me}
+    }
+    """
+
+    with tempfile.NamedTemporaryFile("w+", delete=False) as f:
+        f.write(bib_content)
+        fname = f.name
+
+    result = BibLoader.load(fname, "abstract")
+    os.unlink(fname)
+
+    assert result == ["This is Bib text."]
diff --git a/test/test_normalize.py b/test/test_normalize.py
new file mode 100644
index 0000000..33dd6e5
--- /dev/null
+++ b/test/test_normalize.py
@@ -0,0 +1,17 @@
+from preprocessing.loader import normalize_text
+
+
+def test_normalize_removes_braces():
+    assert normalize_text("{abc}") == "abc"
+
+
+def test_normalize_removes_latex_commands():
+    assert normalize_text(r"\textbf{Hello}") == "Hello"
+
+
+def test_normalize_removes_accents():
+    assert normalize_text(r"\'a") == "a"
+
+
+def test_normalize_collapses_whitespace():
+    assert normalize_text("a  b   c") == "a b c"
diff --git a/test/test_preprocessor_unit.py b/test/test_preprocessor_unit.py
new file mode 100644
index 0000000..7829105
--- /dev/null
+++ b/test/test_preprocessor_unit.py
@@ -0,0 +1,26 @@
+def test_preprocessor_tokenization(preprocessor, simple_texts):
+    preprocessor.texts = simple_texts
+    preprocessor.analyze_texts()
+
+    assert len(preprocessor.token_frequency) > 0
+
+
+def test_preprocessor_bag_of_words(preprocessor, simple_texts):
+    preprocessor.texts = simple_texts
+    preprocessor.analyze_texts()
+    preprocessor.generate_bag_of_words()
+
+    assert len(preprocessor.bag_of_words) == 2
+    assert all(len(doc) > 0 for doc in preprocessor.bag_of_words)
+
+
+def test_generate_document_term_matrix(preprocessor, simple_texts):
+    preprocessor.texts = simple_texts
+    preprocessor.analyze_texts()
+    preprocessor.generate_bag_of_words()
+
+    dtm, vocab = preprocessor.generate_document_term_matrix()
+
+    assert dtm.shape[0] == 2
+    assert dtm.shape[1] == len(vocab)
+    assert dtm.sum() > 0