diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..786a9ed
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,129 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - "main"
+  pull_request:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  lint-python:
+    name: Lint Python
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          cache: "pip"
+
+      - name: Run flake8
+        uses: py-actions/flake8@v2
+
+  validate-compute-block:
+    name: Validate Compute Block Config
+    runs-on: ubuntu-latest
+    needs: lint-python
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Check cbcs
+        run: |
+          python3 - <<'EOF'
+          import main
+
+          from scystream.sdk.config import load_config, get_compute_block
+          from scystream.sdk.config.config_loader import _compare_configs
+          from pathlib import Path
+
+          CBC_PATH = Path("cbc.yaml")
+
+          if not CBC_PATH.exists():
+              raise FileNotFoundError("cbc.yaml not found in repo root.")
+
+          block_from_code = get_compute_block()
+          block_from_yaml = load_config(str(CBC_PATH))
+
+          _compare_configs(block_from_code, block_from_yaml)
+
+          print("cbc.yaml matches python code definition")
+          EOF
+
+  run-test:
+    name: Run Tests
+    runs-on: ubuntu-latest
+    needs: validate-compute-block
+    services:
+      minio:
+        image: lazybit/minio
+        ports:
+          - 9000:9000
+        env:
+          MINIO_ROOT_USER: minioadmin
+          MINIO_ROOT_PASSWORD: minioadmin
+        options: >-
+          --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1"
+          --health-interval 5s
+          --health-retries 5
+          --health-timeout 5s
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Run Tests
+        run: pytest -vv
+
+  build:
+    name: Build Docker Image
+    runs-on: ubuntu-latest
+    needs: run-test
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
+          tags: |
+            type=ref, event=pr
+            type=raw, value=latest, enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
deleted file mode 100644
index df0d4cf..0000000
--- a/.github/workflows/docker.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Docker
-on:
-  push:
-    branches:
-      - "main"
-  pull_request:
-
-env:
-  REGISTRY: ghcr.io
-  IMAGE_NAME: ${{ github.repository }}
-
-jobs:
-  build:
-    name: Build docker image
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-    steps:
-      - name: Checkout Repository
-        uses: actions/checkout@v4
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          registry: ${{ env.REGISTRY }}
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata for docker
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/language-preprocessing
-          tags: |
-            type=ref, event=pr
-            type=raw, value=latest, enable=${{ (github.ref == format('refs/heads/{0}', 'main')) }}
-
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v5
-        with:
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
diff --git a/input.bib b/input.bib
new file mode 100644
index 0000000..f525305
--- /dev/null
+++ b/input.bib
@@ -0,0 +1,112 @@
+
+@article{ WOS:001016714700004,
+Author = {White, Joel},
+Title = {Theoretical and Practical Paralogisms of Digital Immortality},
+Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY},
+Year = {2022},
+Volume = {9},
+Number = {2, SI},
+Pages = {155-172},
+Month = {JUL 3},
+Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic
+   and popular relevance; specific naive metaphysical ideas, such as
+   immortality, have returned with this rise. This article refrains from
+   any ethical or political assessment of transhumanism. Still, it
+   critiques the exact metaphysical or idealistic nature of transhumanism
+   and its pursuit of digital immortality: the idea that, through
+   technological advancements, precisely in Artificial General
+   Intelligence, an immortal virtual ``self{''} will become possible. The
+   article follows the form of Immanuel Kant's ``Paralogisms{''} from the
+   Critique of Pure Reason, where Kant is concerned with the substantial,
+   immortal nature of the soul and its experiential impossibility. The
+   article will offer theoretical and practical paralogisms (false logical
+   inferences), arguing that the transhumanist claim that digital
+   immortality is possible fundamentally stems from two incorrect major
+   premises. The first concerns the substantial nature of information,
+   which informs the theoretical paralogisms; the second concerns infinite
+   transformation (pure plasticity), which informs the practical
+   paralogisms},
+Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD},
+Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND},
+Type = {Article},
+Language = {English},
+DOI = {10.1080/20539320.2022.2150463},
+ISSN = {2053-9320},
+EISSN = {2053-9339},
+Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms;
+   Digital Immortality},
+Research-Areas = {Philosophy},
+Web-of-Science-Categories = {Philosophy},
+Author-Email = {jhmw01@gmail.com},
+ORCID-Numbers = {White, Joel/0000-0001-6460-0564},
+Number-of-Cited-References = {30},
+Times-Cited = {0},
+Usage-Count-Last-180-days = {3},
+Usage-Count-Since-2013 = {15},
+Journal-ISO = {J. Aesthet. Phenomenol.},
+Doc-Delivery-Number = {K5GF0},
+Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)},
+Unique-ID = {WOS:001016714700004},
+DA = {2025-06-26},
+}
+
+@article{ WOS:001322577100012,
+Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir},
+Title = {Sociopolitical Challenges to Digital Transformation of Rural
+   Communities: Learnings from a Case Study From Manipur, India},
+Journal = {IT PROFESSIONAL},
+Year = {2024},
+Volume = {26},
+Number = {4},
+Pages = {42-47},
+Month = {JUL-AUG},
+Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized
+   the inclusive growth of digital networks and digital public goods,
+   utilizing a multistakeholder systems approach. Similarly, the
+   information and communications technology (ICT) Innovation and
+   Intervention Program of the Government of India's Digital North East
+   Vision 2022 has also emphasized a need for inclusive growth of ICT in
+   the Northeast Region. In line with the above, this article presents
+   insights from a field study conducted in the rural parts of Manipur,
+   India, which incidentally can be found to be applicable to many rural
+   parts of the developing world. The article envisions a community-driven
+   sociodigital transformation of the Northeast Region of India. In this
+   quest, the article highlights sociopolitical challenges for digital
+   transformation and provides insights for inclusive ICT in such
+   regions-infrastructure as a utility for every citizen, smart governance
+   and services on demand, digital empowerment of citizens, social welfare,
+   capacity building, and community engagement.},
+Publisher = {IEEE COMPUTER SOC},
+Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA},
+Type = {Article},
+Language = {English},
+Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India.
+   Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India.
+   Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India.
+   Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.},
+DOI = {10.1109/MITP.2024.3433459},
+ISSN = {1520-9202},
+EISSN = {1941-045X},
+Keywords = {Technological innovation; Digital transformation; Government; Buildings;
+   Asia; Africa; Information and communication technology},
+Research-Areas = {Computer Science; Telecommunications},
+Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software
+   Engineering; Telecommunications},
+Author-Email = {vkant@iitk.ac.in
+   sanjrampk@iiti.ac.in
+   sudhir.dixit@ieee.org},
+Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of
+   Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT
+   System); Indian Institute of Technology (IIT) - Indore},
+ResearcherID-Numbers = {/ITU-6308-2023},
+ORCID-Numbers = {/0000-0002-6215-7500},
+Number-of-Cited-References = {7},
+Times-Cited = {0},
+Usage-Count-Last-180-days = {11},
+Usage-Count-Since-2013 = {22},
+Journal-ISO = {IT Prof.},
+Doc-Delivery-Number = {H3O9D},
+Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)},
+Unique-ID = {WOS:001322577100012},
+DA = {2025-06-26},
+}
diff --git a/test/files/test.txt b/input.txt
similarity index 100%
rename from test/files/test.txt
rename to input.txt
diff --git a/main.py b/main.py
index d48e02e..6698911 100644
--- a/main.py
+++ b/main.py
@@ -3,10 +3,10 @@
 
 from scystream.sdk.core import entrypoint
 from scystream.sdk.env.settings import (
-        EnvSettings,
-        InputSettings,
-        OutputSettings,
-        FileSettings
+    EnvSettings,
+    InputSettings,
+    OutputSettings,
+    FileSettings
 )
 from scystream.sdk.file_handling.s3_manager import S3Operations
 
@@ -81,7 +81,7 @@ def _preprocess_and_store(texts, settings):
     dtm, vocab = pre.generate_document_term_matrix()
 
     with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \
-        tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
+            tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab:
 
         pickle.dump(dtm, tmp_dtm)
         tmp_dtm.flush()
@@ -108,40 +108,3 @@ def preprocess_bib_file(settings):
         attribute=settings.bib_input.SELECTED_ATTRIBUTE,
     )
     _preprocess_and_store(texts, settings)
-
-
-"""
-if __name__ == "__main__":
-    test = PreprocessBIB(
-        bib_input=BIBFileInput(
-            S3_HOST="http://localhost",
-            S3_PORT="9000",
-            S3_ACCESS_KEY="minioadmin",
-            S3_SECRET_KEY="minioadmin",
-            BUCKET_NAME="input-bucket",
-            FILE_PATH="input_file_path",
-            FILE_NAME="wos_export",
-            SELECTED_ATTRIBUTE="abstract"
-        ),
-        dtm_output=DTMFileOutput(
-            S3_HOST="http://localhost",
-            S3_PORT="9000",
-            S3_ACCESS_KEY="minioadmin",
-            S3_SECRET_KEY="minioadmin",
-            BUCKET_NAME="output-bucket",
-            FILE_PATH="output_file_path",
-            FILE_NAME="dtm_file_bib"
-        ),
-        vocab_output=VocabFileOutput(
-            S3_HOST="http://localhost",
-            S3_PORT="9000",
-            S3_ACCESS_KEY="minioadmin",
-            S3_SECRET_KEY="minioadmin",
-            BUCKET_NAME="output-bucket",
-            FILE_PATH="output_file_path",
-            FILE_NAME="vocab_file_bib"
-        )
-    )
-
-    preprocess_bib_file(test)
-"""
diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/preprocessing/loader.py b/preprocessing/loader.py
index d55aac3..9ff51c6 100644
--- a/preprocessing/loader.py
+++ b/preprocessing/loader.py
@@ -5,17 +5,17 @@ def normalize_text(text: str) -> str:
     if not text:
         return ""
 
-    # Remove curly braces
-    text = re.sub(r"[{}]", "", text)
 
-    # Remove LaTeX commands
-    text = re.sub(r"\\[a-zA-Z]+\s*(\{[^}]*\})?", "", text)
+    text = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", text)
+
+    text = re.sub(r"\\[a-zA-Z]+", "", text)
 
-    # Remove LaTeX escaped quotes/accents
-    text = re.sub(r"\\""[a-zA-Z]", lambda m: m.group(0)[-1], text)
+    text = re.sub(r"[{}]", "", text)
+
+    text = re.sub(r'\\"([a-zA-Z])', r'\1', text)
     text = re.sub(r"\\'", "", text)
 
-    text = text.replace("'", "")
+    text = re.sub(r"\s+", " ", text)
 
     return text.strip()
 
diff --git a/requirements.txt b/requirements.txt
index 3493ec1..e7737db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ spacy==3.8.7
 nltk==3.9.1
 numpy==2.3.3
 bibtexparser==1.4.3
+pytest==9.0.1
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..5ce5cca
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,20 @@
+import pytest
+
+from preprocessing.core import Preprocessor
+
+
+@pytest.fixture
+def simple_texts():
+    return ["This is a test sentence.", "Another test sentence."]
+
+
+@pytest.fixture
+def preprocessor():
+    return Preprocessor(
+        language="en",
+        filter_stopwords=True,
+        unigram_normalizer="porter",
+        use_ngrams=True,
+        ngram_min=2,
+        ngram_max=3,
+    )
diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl
new file mode 100644
index 0000000..8127603
Binary files /dev/null and b/test/files/expected_dtm_from_bib.pkl differ
diff --git a/test/files/expected_dtm_from_txt.pkl b/test/files/expected_dtm_from_txt.pkl
new file mode 100644
index 0000000..6ca1a74
Binary files /dev/null and b/test/files/expected_dtm_from_txt.pkl differ
diff --git a/test/files/expected_vocab_from_bib.pkl b/test/files/expected_vocab_from_bib.pkl
new file mode 100644
index 0000000..0641a70
Binary files /dev/null and b/test/files/expected_vocab_from_bib.pkl differ
diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl
new file mode 100644
index 0000000..0698bc3
Binary files /dev/null and b/test/files/expected_vocab_from_txt.pkl differ
diff --git a/test/files/input.txt b/test/files/input.txt
new file mode 100644
index 0000000..1755b0f
--- /dev/null
+++ b/test/files/input.txt
@@ -0,0 +1,4 @@
+Cats chase mice. Dogs chase cats.
+Birds fly high. Cats and dogs coexist.
+Mice hide from cats. Birds sing loudly.
+Cats and dogs coexist. Cats and dogs coexist.
diff --git a/test/test_full.py b/test/test_full.py
new file mode 100644
index 0000000..c12be20
--- /dev/null
+++ b/test/test_full.py
@@ -0,0 +1,213 @@
+import os
+import boto3
+import pytest
+import pickle
+import numpy as np
+
+from pathlib import Path
+from main import preprocess_bib_file, preprocess_txt_file
+from botocore.exceptions import ClientError
+
+MINIO_USER = "minioadmin"
+MINIO_PWD = "minioadmin"
+BUCKET_NAME = "testbucket"
+
+
+def ensure_bucket(s3, bucket):
+    try:
+        s3.head_bucket(Bucket=bucket)
+    except ClientError as e:
+        error_code = e.response["Error"]["Code"]
+        if error_code in ("404", "NoSuchBucket"):
+            s3.create_bucket(Bucket=bucket)
+        else:
+            raise
+
+
+def download_to_tmp(s3, bucket, key):
+    tmp_path = Path("/tmp") / key.replace("/", "_")
+    s3.download_file(bucket, key, str(tmp_path))
+    return tmp_path
+
+
+@pytest.fixture
+def s3_minio():
+    client = boto3.client(
+        "s3",
+        endpoint_url="http://localhost:9000",
+        aws_access_key_id=MINIO_USER,
+        aws_secret_access_key=MINIO_PWD
+    )
+    ensure_bucket(client, BUCKET_NAME)
+    return client
+
+
+def test_full_bib(s3_minio):
+    input_file_name = "input"
+    dtm_output_file_name = "dtm_file"
+    vocab_output_file_name = "vocab_file"
+
+    bib_path = Path(__file__).parent / "files" / f"{input_file_name}.bib"
+    bib_bytes = bib_path.read_bytes()
+
+    s3_minio.put_object(
+        Bucket=BUCKET_NAME,
+        Key=f"{input_file_name}.bib",
+        Body=bib_bytes
+    )
+
+    env = {
+        "bib_file_S3_HOST": "http://127.0.0.1",
+        "bib_file_S3_PORT": "9000",
+        "bib_file_S3_ACCESS_KEY": MINIO_USER,
+        "bib_file_S3_SECRET_KEY": MINIO_PWD,
+        "bib_file_BUCKET_NAME": BUCKET_NAME,
+        "bib_file_FILE_PATH": "",
+        "bib_file_FILE_NAME": input_file_name,
+        "bib_file_SELECTED_ATTRIBUTE": "abstract",
+
+        "dtm_output_S3_HOST": "http://127.0.0.1",
+        "dtm_output_S3_PORT": "9000",
+        "dtm_output_S3_ACCESS_KEY": MINIO_USER,
+        "dtm_output_S3_SECRET_KEY": MINIO_PWD,
+        "dtm_output_BUCKET_NAME": BUCKET_NAME,
+        "dtm_output_FILE_PATH": "",
+        "dtm_output_FILE_NAME": dtm_output_file_name,
+
+        "vocab_output_S3_HOST": "http://127.0.0.1",
+        "vocab_output_S3_PORT": "9000",
+        "vocab_output_S3_ACCESS_KEY": MINIO_USER,
+        "vocab_output_S3_SECRET_KEY": MINIO_PWD,
+        "vocab_output_BUCKET_NAME": BUCKET_NAME,
+        "vocab_output_FILE_PATH": "",
+        "vocab_output_FILE_NAME": vocab_output_file_name,
+    }
+
+    for k, v in env.items():
+        os.environ[k] = v
+
+    preprocess_bib_file()
+
+    keys = [
+        o["Key"]
+        for o in s3_minio.list_objects_v2(
+            Bucket=BUCKET_NAME).get("Contents", [])
+    ]
+
+    assert f"{dtm_output_file_name}.pkl" in keys
+    assert f"{vocab_output_file_name}.pkl" in keys
+
+    dtm_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{dtm_output_file_name}.pkl")
+    vocab_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{vocab_output_file_name}.pkl")
+
+    # Load produced results
+    with open(dtm_path, "rb") as f:
+        dtm = pickle.load(f)
+
+    with open(vocab_path, "rb") as f:
+        vocab = pickle.load(f)
+
+    # Load expected snapshot files
+    expected_vocab_path = Path(__file__).parent / \
+        "files" / "expected_vocab_from_bib.pkl"
+    expected_dtm_path = Path(__file__).parent / "files" / \
+        "expected_dtm_from_bib.pkl"
+
+    with open(expected_vocab_path, "rb") as f:
+        expected_vocab = pickle.load(f)
+
+    with open(expected_dtm_path, "rb") as f:
+        expected_dtm = pickle.load(f)
+
+    assert vocab == expected_vocab
+    np.testing.assert_array_equal(dtm, expected_dtm)
+
+
+def test_full_txt(s3_minio):
+    input_file_name = "input"
+    dtm_output_file_name = "dtm_txt_file"
+    vocab_output_file_name = "vocab_txt_file"
+
+    txt_path = Path(__file__).parent / "files" / f"{input_file_name}.txt"
+    txt_bytes = txt_path.read_bytes()
+
+    s3_minio.put_object(
+        Bucket=BUCKET_NAME,
+        Key=f"{input_file_name}.txt",
+        Body=txt_bytes
+    )
+
+    env = {
+        "txt_file_S3_HOST": "http://127.0.0.1",
+        "txt_file_S3_PORT": "9000",
+        "txt_file_S3_ACCESS_KEY": MINIO_USER,
+        "txt_file_S3_SECRET_KEY": MINIO_PWD,
+        "txt_file_BUCKET_NAME": BUCKET_NAME,
+        "txt_file_FILE_PATH": "",
+        "txt_file_FILE_NAME": input_file_name,
+
+        "dtm_output_S3_HOST": "http://127.0.0.1",
+        "dtm_output_S3_PORT": "9000",
+        "dtm_output_S3_ACCESS_KEY": MINIO_USER,
+        "dtm_output_S3_SECRET_KEY": MINIO_PWD,
+        "dtm_output_BUCKET_NAME": BUCKET_NAME,
+        "dtm_output_FILE_PATH": "",
+        "dtm_output_FILE_NAME": dtm_output_file_name,
+
+        "vocab_output_S3_HOST": "http://127.0.0.1",
+        "vocab_output_S3_PORT": "9000",
+        "vocab_output_S3_ACCESS_KEY": MINIO_USER,
+        "vocab_output_S3_SECRET_KEY": MINIO_PWD,
+        "vocab_output_BUCKET_NAME": BUCKET_NAME,
+        "vocab_output_FILE_PATH": "",
+        "vocab_output_FILE_NAME": vocab_output_file_name,
+    }
+
+    for k, v in env.items():
+        os.environ[k] = v
+
+    preprocess_txt_file()
+
+    keys = [
+        o["Key"]
+        for o in s3_minio.list_objects_v2(
+            Bucket=BUCKET_NAME).get("Contents", [])
+    ]
+
+    assert f"{dtm_output_file_name}.pkl" in keys
+    assert f"{vocab_output_file_name}.pkl" in keys
+
+    # Download produced files
+    dtm_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{dtm_output_file_name}.pkl")
+    vocab_path = download_to_tmp(
+        s3_minio, BUCKET_NAME, f"{vocab_output_file_name}.pkl")
+
+    # Load produced results
+    with open(dtm_path, "rb") as f:
+        dtm = pickle.load(f)
+
+    with open(vocab_path, "rb") as f:
+        vocab = pickle.load(f)
+
+    # Load expected snapshot files
+    expected_vocab_path = Path(__file__).parent / \
+        "files" / "expected_vocab_from_txt.pkl"
+    expected_dtm_path = Path(__file__).parent / \
+        "files" / "expected_dtm_from_txt.pkl"
+
+    with open(expected_vocab_path, "rb") as f:
+        expected_vocab = pickle.load(f)
+
+    with open(expected_dtm_path, "rb") as f:
+        expected_dtm = pickle.load(f)
+
+    # Assertions
+    assert vocab == expected_vocab
+
+    if hasattr(dtm, "toarray"):
+        np.testing.assert_array_equal(dtm.toarray(), expected_dtm.toarray())
+    else:
+        np.testing.assert_array_equal(dtm, expected_dtm)
diff --git a/test/test_loaders.py b/test/test_loaders.py
new file mode 100644
index 0000000..3c96468
--- /dev/null
+++ b/test/test_loaders.py
@@ -0,0 +1,33 @@
+import os
+import tempfile
+
+from preprocessing.loader import TxtLoader, BibLoader
+
+
+def test_txt_loader_reads_and_normalizes():
+    with tempfile.NamedTemporaryFile("w+", delete=False) as f:
+        f.write("Hello {World}\nSecond line")
+        fname = f.name
+
+    result = TxtLoader.load(fname)
+    os.unlink(fname)
+
+    assert result == ["Hello World", "Second line"]
+
+
+def test_bib_loader_extracts_attribute():
+    bib_content = r"""
+    @article{a,
+      abstract = {This is {Bib} \textbf{text}.},
+      title = {Ignore me}
+    }
+    """
+
+    with tempfile.NamedTemporaryFile("w+", delete=False) as f:
+        f.write(bib_content)
+        fname = f.name
+
+    result = BibLoader.load(fname, "abstract")
+    os.unlink(fname)
+
+    assert result == ["This is Bib text."]
diff --git a/test/test_normalize.py b/test/test_normalize.py
new file mode 100644
index 0000000..33dd6e5
--- /dev/null
+++ b/test/test_normalize.py
@@ -0,0 +1,17 @@
+from preprocessing.loader import normalize_text
+
+
+def test_normalize_removes_braces():
+    assert normalize_text("{abc}") == "abc"
+
+
+def test_normalize_removes_latex_commands():
+    assert normalize_text(r"\textbf{Hello}") == "Hello"
+
+
+def test_normalize_removes_accents():
+    assert normalize_text(r"\'a") == "a"
+
+
+def test_normalize_collapses_whitespace():
+    assert normalize_text("a  b   c") == "a b c"
diff --git a/test/test_preprocessor_unit.py b/test/test_preprocessor_unit.py
new file mode 100644
index 0000000..7829105
--- /dev/null
+++ b/test/test_preprocessor_unit.py
@@ -0,0 +1,26 @@
+def test_preprocessor_tokenization(preprocessor, simple_texts):
+    preprocessor.texts = simple_texts
+    preprocessor.analyze_texts()
+
+    assert len(preprocessor.token_frequency) > 0
+
+
+def test_preprocessor_bag_of_words(preprocessor, simple_texts):
+    preprocessor.texts = simple_texts
+    preprocessor.analyze_texts()
+    preprocessor.generate_bag_of_words()
+
+    assert len(preprocessor.bag_of_words) == 2
+    assert all(len(doc) > 0 for doc in preprocessor.bag_of_words)
+
+
+def test_generate_document_term_matrix(preprocessor, simple_texts):
+    preprocessor.texts = simple_texts
+    preprocessor.analyze_texts()
+    preprocessor.generate_bag_of_words()
+
+    dtm, vocab = preprocessor.generate_document_term_matrix()
+
+    assert dtm.shape[0] == 2
+    assert dtm.shape[1] == len(vocab)
+    assert dtm.sum() > 0