From b5874cc0cbd62add13c36ac57ec7ccf18bd13dd5 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:26:58 -0600 Subject: [PATCH 1/7] chore: add tests --- .github/workflows/{docker.yaml => ci.yaml} | 50 ++++- input.bib | 112 +++++++++++ test/files/test.txt => input.txt | 0 main.py | 47 +---- preprocessing/__init__.py | 0 preprocessing/loader.py | 14 +- requirements.txt | 1 + test/__init__.py | 0 test/conftest.py | 20 ++ test/files/expected_dtm_from_bib.pkl | Bin 0 -> 7546 bytes test/files/expected_dtm_from_txt.pkl | Bin 0 -> 1081 bytes test/files/expected_vocab_from_bib.pkl | Bin 0 -> 9047 bytes test/files/expected_vocab_from_txt.pkl | Bin 0 -> 400 bytes test/files/input.txt | 4 + test/test_full.py | 213 +++++++++++++++++++++ test/test_loaders.py | 33 ++++ test/test_normalize.py | 17 ++ test/test_preprocessor_unit.py | 26 +++ 18 files changed, 487 insertions(+), 50 deletions(-) rename .github/workflows/{docker.yaml => ci.yaml} (52%) create mode 100644 input.bib rename test/files/test.txt => input.txt (100%) create mode 100644 preprocessing/__init__.py create mode 100644 test/__init__.py create mode 100644 test/conftest.py create mode 100644 test/files/expected_dtm_from_bib.pkl create mode 100644 test/files/expected_dtm_from_txt.pkl create mode 100644 test/files/expected_vocab_from_bib.pkl create mode 100644 test/files/expected_vocab_from_txt.pkl create mode 100644 test/files/input.txt create mode 100644 test/test_full.py create mode 100644 test/test_loaders.py create mode 100644 test/test_normalize.py create mode 100644 test/test_preprocessor_unit.py diff --git a/.github/workflows/docker.yaml b/.github/workflows/ci.yaml similarity index 52% rename from .github/workflows/docker.yaml rename to .github/workflows/ci.yaml index df0d4cf..3939887 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,5 @@ -name: Docker +name: CI + on: push: branches: @@ -10,9 +11,55 @@ env: IMAGE_NAME: ${{ github.repository }} jobs: + lint-python: + name: Lint Python + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + cache: "pip" + + - name: Run flake8 + uses: py-actions/flake8@v2 + + test: + runs-on: ubuntu-latest + needs: lint-python + services: + minio: + image: lazybit/minio + ports: + - 9000:9000 + env: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + options: >- + --health-cmd "curl -f http://localhost:9000/minio/health/live || exit 1" + --health-interval 5s + --health-retries 5 + --health-timeout 5s + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + cache: "pip" + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Run Tests + run: pytest -vv + build: name: Build docker image runs-on: ubuntu-latest + needs: test permissions: contents: read packages: write @@ -42,3 +89,4 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + diff --git a/input.bib b/input.bib new file mode 100644 index 0000000..f525305 --- /dev/null +++ b/input.bib @@ -0,0 +1,112 @@ + +@article{ WOS:001016714700004, +Author = {White, Joel}, +Title = {Theoretical and Practical Paralogisms of Digital Immortality}, +Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY}, +Year = {2022}, +Volume = {9}, +Number = {2, SI}, +Pages = {155-172}, +Month = {JUL 3}, +Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic + and popular relevance; specific 
naive metaphysical ideas, such as + immortality, have returned with this rise. This article refrains from + any ethical or political assessment of transhumanism. Still, it + critiques the exact metaphysical or idealistic nature of transhumanism + and its pursuit of digital immortality: the idea that, through + technological advancements, precisely in Artificial General + Intelligence, an immortal virtual ``self{''} will become possible. The + article follows the form of Immanuel Kant's ``Paralogisms{''} from the + Critique of Pure Reason, where Kant is concerned with the substantial, + immortal nature of the soul and its experiential impossibility. The + article will offer theoretical and practical paralogisms (false logical + inferences), arguing that the transhumanist claim that digital + immortality is possible fundamentally stems from two incorrect major + premises. The first concerns the substantial nature of information, + which informs the theoretical paralogisms; the second concerns infinite + transformation (pure plasticity), which informs the practical + paralogisms}, +Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD}, +Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND}, +Type = {Article}, +Language = {English}, +DOI = {10.1080/20539320.2022.2150463}, +ISSN = {2053-9320}, +EISSN = {2053-9339}, +Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms; + Digital Immortality}, +Research-Areas = {Philosophy}, +Web-of-Science-Categories = {Philosophy}, +Author-Email = {jhmw01@gmail.com}, +ORCID-Numbers = {White, Joel/0000-0001-6460-0564}, +Number-of-Cited-References = {30}, +Times-Cited = {0}, +Usage-Count-Last-180-days = {3}, +Usage-Count-Since-2013 = {15}, +Journal-ISO = {J. Aesthet. Phenomenol.}, +Doc-Delivery-Number = {K5GF0}, +Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)}, +Unique-ID = {WOS:001016714700004}, +DA = {2025-06-26}, +} + +@article{ WOS:001322577100012, +Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir}, +Title = {Sociopolitical Challenges to Digital Transformation of Rural + Communities: Learnings from a Case Study From Manipur, India}, +Journal = {IT PROFESSIONAL}, +Year = {2024}, +Volume = {26}, +Number = {4}, +Pages = {42-47}, +Month = {JUL-AUG}, +Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized + the inclusive growth of digital networks and digital public goods, + utilizing a multistakeholder systems approach. Similarly, the + information and communications technology (ICT) Innovation and + Intervention Program of the Government of India's Digital North East + Vision 2022 has also emphasized a need for inclusive growth of ICT in + the Northeast Region. In line with the above, this article presents + insights from a field study conducted in the rural parts of Manipur, + India, which incidentally can be found to be applicable to many rural + parts of the developing world. The article envisions a community-driven + sociodigital transformation of the Northeast Region of India. 
In this + quest, the article highlights sociopolitical challenges for digital + transformation and provides insights for inclusive ICT in such + regions-infrastructure as a utility for every citizen, smart governance + and services on demand, digital empowerment of citizens, social welfare, + capacity building, and community engagement.}, +Publisher = {IEEE COMPUTER SOC}, +Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA}, +Type = {Article}, +Language = {English}, +Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India. + Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India. + Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India. + Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.}, +DOI = {10.1109/MITP.2024.3433459}, +ISSN = {1520-9202}, +EISSN = {1941-045X}, +Keywords = {Technological innovation; Digital transformation; Government; Buildings; + Asia; Africa; Information and communication technology}, +Research-Areas = {Computer Science; Telecommunications}, +Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software + Engineering; Telecommunications}, +Author-Email = {vkant@iitk.ac.in + sanjrampk@iiti.ac.in + sudhir.dixit@ieee.org}, +Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of + Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT + System); Indian Institute of Technology (IIT) - Indore}, +ResearcherID-Numbers = {/ITU-6308-2023}, +ORCID-Numbers = {/0000-0002-6215-7500}, +Number-of-Cited-References = {7}, +Times-Cited = {0}, +Usage-Count-Last-180-days = {11}, +Usage-Count-Since-2013 = {22}, +Journal-ISO = {IT Prof.}, +Doc-Delivery-Number = {H3O9D}, +Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)}, +Unique-ID = {WOS:001322577100012}, +DA = {2025-06-26}, +} diff --git a/test/files/test.txt b/input.txt similarity index 100% rename from test/files/test.txt rename to input.txt diff --git a/main.py b/main.py index d48e02e..6698911 100644 --- a/main.py +++ b/main.py @@ -3,10 +3,10 @@ from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( - EnvSettings, - InputSettings, - OutputSettings, - FileSettings + EnvSettings, + InputSettings, + OutputSettings, + FileSettings ) from scystream.sdk.file_handling.s3_manager import S3Operations @@ -81,7 +81,7 @@ def _preprocess_and_store(texts, settings): dtm, vocab = pre.generate_document_term_matrix() with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ - tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: + tempfile.NamedTemporaryFile(suffix="_vocab.pkl") as tmp_vocab: pickle.dump(dtm, tmp_dtm) tmp_dtm.flush() @@ -108,40 +108,3 @@ def preprocess_bib_file(settings): attribute=settings.bib_input.SELECTED_ATTRIBUTE, ) _preprocess_and_store(texts, settings) - - -""" -if __name__ == "__main__": - test = PreprocessBIB( - bib_input=BIBFileInput( - S3_HOST="http://localhost", - S3_PORT="9000", - S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="input-bucket", - FILE_PATH="input_file_path", - FILE_NAME="wos_export", - SELECTED_ATTRIBUTE="abstract" - ), - dtm_output=DTMFileOutput( - S3_HOST="http://localhost", - S3_PORT="9000", - S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="output-bucket", - FILE_PATH="output_file_path", - FILE_NAME="dtm_file_bib" - ), - vocab_output=VocabFileOutput( - S3_HOST="http://localhost", - S3_PORT="9000", - 
S3_ACCESS_KEY="minioadmin", - S3_SECRET_KEY="minioadmin", - BUCKET_NAME="output-bucket", - FILE_PATH="output_file_path", - FILE_NAME="vocab_file_bib" - ) - ) - - preprocess_bib_file(test) -""" diff --git a/preprocessing/__init__.py b/preprocessing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/loader.py b/preprocessing/loader.py index d55aac3..9ff51c6 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -5,17 +5,17 @@ def normalize_text(text: str) -> str: if not text: return "" - # Remove curly braces - text = re.sub(r"[{}]", "", text) - # Remove LaTeX commands - text = re.sub(r"\\[a-zA-Z]+\s*(\{[^}]*\})?", "", text) + text = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", text) + + text = re.sub(r"\\[a-zA-Z]+", "", text) - # Remove LaTeX escaped quotes/accents - text = re.sub(r"\\""[a-zA-Z]", lambda m: m.group(0)[-1], text) + text = re.sub(r"[{}]", "", text) + + text = re.sub(r'\\"([a-zA-Z])', r'\1', text) text = re.sub(r"\\'", "", text) - text = text.replace("'", "") + text = re.sub(r"\s+", " ", text) return text.strip() diff --git a/requirements.txt b/requirements.txt index 3493ec1..e7737db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ spacy==3.8.7 nltk==3.9.1 numpy==2.3.3 bibtexparser==1.4.3 +pytest==9.0.1 diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..5ce5cca --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,20 @@ +import pytest + +from preprocessing.core import Preprocessor + + +@pytest.fixture +def simple_texts(): + return ["This is a test sentence.", "Another test sentence."] + + +@pytest.fixture +def preprocessor(): + return Preprocessor( + language="en", + filter_stopwords=True, + unigram_normalizer="porter", + use_ngrams=True, + ngram_min=2, + ngram_max=3, + ) diff --git a/test/files/expected_dtm_from_bib.pkl b/test/files/expected_dtm_from_bib.pkl new file mode 100644 index 0000000000000000000000000000000000000000..812760358e55fa9a738dfd127f9a613172985d55 GIT binary patch literal 7546 zcmdU!zityj7{q;c3=K6E^i-}XsG!5b1uqaKjwK=`c66KzcQpKobX}I$;u&}o2=+}g z(oeg4-jjuk`!usN^L@K}y2fA5er{fl_P_S|dV71fo1HH=^=fvzy=ks5>iS|A+GpqW zYPnf|ZtCr_2@j#2?vRR|HN2}X!}stpyqZ_@@rPg4xA1AwPOqBX-Rhrf zeEmAy|2POHZS^KB7K`_P9)J7CeAI-?`?tSe9H!~>tU6O_jrE|M2^d}PvfOpo6ZSqs z<5F|z96fK`ZNPcjm+OQ{Pxbg zuAg>;I$sKxItQ93HJ|*%>wD$-q$l3}rEqS))c1$xqkhhlKefN-2~)1ty^>z{>iQU_ zyiz>%r~I>SfqcYMp7Z3-<>}tVtET4qoTYK@JU%zg^2Mj-(ms_#H8r1flzS?k z=FH`3emZB*cfZz&*LrHc^q%KfI@P}4OZh!N7c0kWZjSVQ_alF5^?4mr4$bHKSYB-1 z^9XxRU!3}$(j0UyG!NwvpPQ4b_rBzJ{z+$a-m&KkyKZQlde435(7sAnnkTo<&^f8+ zo2n}{Z|*q_rKkPS_k+%f=A#_%OMc?rFHCyku5Q(--T0Rh2{iweFSlG2Z)nu5c{pRO^~=sCw+aXs%;(<&S;ur0ZK_??>NU r`RII3#cQr(E6typOMd55^IR9ZSM_nATd~jH$Y|zNg&@g<&{Do4xZ1 zxs1H~Eyw$q`8l?;Z67&Keh++;Zt{_b4?+i_bHWGVW45KK2c6UQ+<)av_jCQ*Z2KO3 Xk30_@`qfQ_J{lFV62D%d;~#e|CV&Gzm#H#0rgGqZP~C<=<{q)~Cg_zvp*=xc#OxAX0nvF%Xplp4H$~Wjw>PLPY%{>GW0y?WB{5N@jU+Z#ZhUk___AVQnzd z#8UHETgfQxtO?7Tc`gK^X5MZOpvnaZXFgocTdjO|8E=u5WtPYw_$e^?B5$(rYPPc4 z!XMb#%G-y*sFydhyqmW&^l$TQje1F2EAAYmCF z0*&^1X%7lK6q~1Huq9=Ddmh~}$XZKfd$~1q?>@TJMn~4 zk)7erJR7wAJa^%B&DEsU!uT>ioTptQTFO>L$4BsD9Wo4>T|1J;n^`|QJB5ah;wei< zy*5NHcNyP{$DH}y8$78cMoUG8I8es-fjA*z?8(Y{U!Js`R-5hi%n<9d689ld_i>BHfV5q@>f z$fI7;V|mNoot;Dj({ako62ep&A1`X?2<>9U2~b^#*kJJxQ2=6QyPFqDf#n~nD|n!0 zoDbsz!jFT#sF#YaPn3`i(S-yg+e2-rynZgmPz8q(R+ zqj^3Agx!3%NbndQUrN`~R^Bb+Q+dkL(eAtu_k1jmS{r&CW)#nXgBj5u&vPN*74QU1 zq96Jtp2(YPNMt>tj8AiuIvaly&zMgQ5M@hgQpQgP^E#zPyLqpd?SOYZ#jVvz``~zI 
z8K3UBvWS!asXSq(&_r3rXYi(=9huCTICi>&9h!@FICcyGN6(>@CT}!jHOrWYvBV2Q z7}8$8mf`ah+hVCLds^ED(|De~QY0*t9Ilv&)FDYeCugEtyJJsPv9L)&X7(KDxSeJRTCjfjIWHJ$s@~23&gp+L5K(;RMO_NcvIDs5Gg+P zY#v?CQt_wrctWYD*B>ll^7DBvXhR7a&C2*WJmw0E=VFM~7`CFgfaim@#LT2KNZ}06 z14~+SJfEk82)Zuh*%}h$s(&Wl3F+_W$8CK<2feX{2aiWZ`_FvjSXM;BK2Qn-2$Lo1oVW0lX_zk?d4n?lKe!+k4HxrQ~&w_#b^t%#)YM2nQS^B8)x9(o6) zQqpFLcj_Q?X*&v}co$FFFD-NL=9v(HUI#F!_dp6Wp-(XH<;6M_dFw9Y_bK19pwr;{ zp;$A6L82n-Az^>OPrQ~D{ejelALKdvrG-#iw3f^GLn5CEG1o#zohFnhuKHn2VH=18 z>a*8D-Wuec_j*iXtBxCZgCUK1eguMRkD8}D34RoU7}Qzy$NYDdHJ9;?U^@KHdCpC| z9=P-Jn|Y&eMeXFw91-vqUKqmURM?gy)~!5h{qp13Q1MVV?I(l@-!{%v#-9{T2#N`< zaz4fLb;zmc(=xY);LnI2K;g_O=fuxqUwt#MOiciv<8gGF+V=CpgVJF}{(^`zghr+~ z`=aRH#NjWAS~c#Py1IupyMS-Qs>9m>8-KsU+xCX5Lg2fid%e2&o)`ou4fgih z;`^eB8YcD9N|7Lk{6O??kE}lwZj_E}3jL87VSu<)`eTvO;IIrWxql*zhmdiCpNeC- zwDU7z&NO^{{JAh!!%%Wr$Nh!ay1#w-rI?3pRRa76^3F zZ+RgEs)JD(|IU8v3iaOszwc5$)IDEqlQ>Em5PvW9@uN&a!kH+^HE_ z_P`$oOWMD|2koOHh!Z7^G1XjkF*hb(N2r_j!g~TU^+Bm(hTIS+BnMks+ndRyvcM`S8*lHHF$9tmAKg(Wc`xn zMSeMZ(B=kD2qiWmo=giOM+LY;3vFrexI6pNqG-)jQBKaS4W6$-SWS<1v`sXC+`U@T zb`eWRa}1;d8az|A-D!&sZ18%8^)N;UiG(srk6<>H(rp{OFpNS9sXHmclIB8>4hF0B zt30|Llv#t|S0=X8AsA>k^iXI(NsV-C0-B8Z4^LHznN#Uf{m$rE+{J>2Ro^j`>UIh?Y?xuh3@LUtOb;k?j|y~a6(2xq`M1irr{0v9>Q7;tCW0CQL>&W>0Yv; zknS)^_eM+!kyqv8%K@Wi0*}^lZ zGUR-!&VM0qgDgdS=plFS07Yw9iOVP>Kbw8@-K)WIh2(@C;prU-D^k{+)7 z>jxHd-h>ws@Sn>S3!Q96g05C2T~+!WhF=p+9w9?*9C)NSl+xyQhaLsL4ggc^&?#cL za{ES)7UKXFc}sf?;$?_non;NAr|R7J)ZWn1V|8v1n9~kDPL$w+;N!)9OvAbO6GR6g z2rj6d$9L9l55hxymenab|*9F%)D07nK~;TuDnC;gb!+`MNl?KK5i*1dP=6Y3NEx2 zyaUtFa~|c=g%h8by(_X+fPX-vUBZMuZ&88lqog=36x$O$N|&~)@Yt{8J{S(bn+v^b z@B>3vo3jmfz`EXP(^+6$D443*-?L$=0bpf)nhwB0m7WgW$&nQ@+4yyi zGNLyp^bG8?xkYhzCiG0`q6RthJ{Nhlm$fszEVix#tCTS@RZH-R95>Vb(gC$wTg%$7%6JvNLjxK`%xO z7OGZ{T`X2m2N6Z|qJb_EMc9mVsn~(i*7Qq}t^$IJ^fFmdF8$~wVs@agTkaNsE|+mN z%-#jiOJ%q8`jcKJY8O(>Ro`^zZAdPS8QIoELG&Kk+k~1;A7VK9Y z_EL>r2P+>&<#?q@`t=B;?f_4306)V=cX}g)83JMLrYd_=IKY(*Zr1k@J?7%(5s2}T_RmI zPb~$%8(yS+`sQEKdk`$Ec6G#fuUl2WbGY-N_lb^V1=DLH~pJ=W-!xUU}(5@ZZ|Ero*j$Exc(U&#ClbxS&vCw|cC*4oN8hutR#byIw}fO*q|v zpeK~Se4&p({t&TG*Yr_vr+ED=TGGd`iTmjA=hquyeC`f}Zo+!CYfHKr-X0=VR=5RD zWZ$dkR;bI+?yZVG4n}K`d3~Z!V3>U=r%z%zL%R_CDU1pc`+ouG(+Dl*<&r+r7##V3 D$S9&b literal 0 HcmV?d00001 diff --git a/test/files/expected_vocab_from_txt.pkl b/test/files/expected_vocab_from_txt.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0698bc3d6c3ccde57d064f0e7c0306221b561679 GIT binary patch literal 400 zcmX|-&2GXl5QL#I5R-;~N?SPi*b^_}r+@^AB_L7*QZJQy>1~eYec82R`?51D&u8!c ziN}Ap~2=~ zAgQXN+BFBO`xr&1E~cQ?00VB7HIY(`Tdj)v@M9VRNc7t9kWh?KbjRupL+?E1l`tWa zBXPJ9B-QZt7 0 + + +def test_preprocessor_bag_of_words(preprocessor, simple_texts): + preprocessor.texts = simple_texts + preprocessor.analyze_texts() + preprocessor.generate_bag_of_words() + + assert len(preprocessor.bag_of_words) == 2 + assert all(len(doc) > 0 for doc in preprocessor.bag_of_words) + + +def test_generate_document_term_matrix(preprocessor, simple_texts): + preprocessor.texts = simple_texts + preprocessor.analyze_texts() + preprocessor.generate_bag_of_words() + + dtm, vocab = preprocessor.generate_document_term_matrix() + + assert dtm.shape[0] == 2 + assert dtm.shape[1] == len(vocab) + assert dtm.sum() > 0 From db2ab5546e413a5007906a7aeadb624ec87a39eb Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:56:11 -0600 Subject: [PATCH 2/7] chore: test cbc validity --- .github/workflows/ci.yaml | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3939887..6a49819 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,9 +25,44 @@ jobs: - name: Run flake8 uses: py-actions/flake8@v2 - test: + validate-compute-block: runs-on: ubuntu-latest needs: lint-python + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + + - name: Intall dependencies + run: | + pip install -r requirements.txt + + - name: Check cbcs + run: | + python3 - <<'EOF' + import main + + from scystream.sdk.config import load_config, get_compute_block + from scystream.sdk.config.config_loader import _compare_configs + from pathlib import Path + + CBC_PATH = Path("cbc.yaml") + + if not CBC_PATH.exists(): + raise FileNotFoundError("cbc.yaml not found in repo root.") + + block_from_code = get_compute_block() + block_from_yaml = load_config(str(CBC_PATH)) + + _compare_configs(block_from_code, block_from_yaml) + + print("cbc.yaml matches python code definition") + EOF + + run-test: + runs-on: ubuntu-latest + needs: validate-compute-block services: minio: image: lazybit/minio @@ -59,7 +94,7 @@ jobs: build: name: Build docker image runs-on: ubuntu-latest - needs: test + needs: run-test permissions: contents: read packages: write From 182e71c04c232f556cc73dd13a71e30095cbb471 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 1 Dec 2025 21:58:14 -0600 Subject: [PATCH 3/7] chore: rename workflow steps --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6a49819..786a9ed 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -26,6 +26,7 @@ jobs: uses: py-actions/flake8@v2 validate-compute-block: + name: Validate Compute Block Config runs-on: ubuntu-latest needs: lint-python steps: @@ -61,6 +62,7 @@ jobs: EOF run-test: + name: Run Tests runs-on: ubuntu-latest needs: validate-compute-block services: @@ -92,7 +94,7 @@ jobs: run: pytest -vv build: - name: Build docker image + name: Build Docker Image runs-on: ubuntu-latest needs: run-test permissions: From ab58dbb75b9aa93f2faf7f763aea0fdf24cdffee Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 3 Dec 2025 13:50:20 -0600 Subject: [PATCH 4/7] chore: add logging --- main.py | 24 +++++++++++++++++++++++- preprocessing/core.py | 21 ++++++++++++++++++--- preprocessing/loader.py | 5 +++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 6698911..64b9e21 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import pickle import tempfile +import logging from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import ( @@ -14,6 +15,13 @@ from preprocessing.loader import TxtLoader, BibLoader +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + + class DTMFileOutput(FileSettings, OutputSettings): __identifier__ = "dtm_output" @@ -66,6 +74,8 @@ class PreprocessBIB(EnvSettings): def _preprocess_and_store(texts, settings): """Shared preprocessing logic for TXT and BIB.""" + logger.info(f"Starting preprocessing with {len(texts)} documents") + pre = Preprocessor( language=settings.LANGUAGE, filter_stopwords=settings.FILTER_STOPWORDS, @@ -74,10 +84,12 @@ def _preprocess_and_store(texts, settings): ngram_min=settings.NGRAM_MIN, ngram_max=settings.NGRAM_MAX, ) - pre.texts = texts + pre.texts = texts pre.analyze_texts() + pre.generate_bag_of_words() + dtm, 
vocab = pre.generate_document_term_matrix() with tempfile.NamedTemporaryFile(suffix="_dtm.pkl") as tmp_dtm, \ @@ -89,20 +101,30 @@ def _preprocess_and_store(texts, settings): pickle.dump(vocab, tmp_vocab) tmp_vocab.flush() + logger.info("Uploading DTM to S3...") S3Operations.upload(settings.dtm_output, tmp_dtm.name) + + logger.info("Uploading vocabulary to S3...") S3Operations.upload(settings.vocab_output, tmp_vocab.name) + logger.info("Preprocessing completed successfully.") + @entrypoint(PreprocessTXT) def preprocess_txt_file(settings): + logger.info("Downloading TXT input from S3...") S3Operations.download(settings.txt_input, "input.txt") + texts = TxtLoader.load("./input.txt") + _preprocess_and_store(texts, settings) @entrypoint(PreprocessBIB) def preprocess_bib_file(settings): + logger.info("Downloading BIB input from S3...") S3Operations.download(settings.bib_input, "input.bib") + texts = BibLoader.load( "./input.bib", attribute=settings.bib_input.SELECTED_ATTRIBUTE, diff --git a/preprocessing/core.py b/preprocessing/core.py index 4db4585..dba2a8d 100644 --- a/preprocessing/core.py +++ b/preprocessing/core.py @@ -1,3 +1,4 @@ +import logging import spacy import numpy as np @@ -9,6 +10,7 @@ "en": "en_core_web_sm", "de": "de_core_news_sm" } +logger = logging.getLogger(__name__) class Preprocessor: @@ -21,6 +23,12 @@ def __init__( ngram_min: int = 2, ngram_max: int = 3, ): + logger.info( + "Init Preprocessor (lang=%s, filter_stopwords=%s, ngrams=%s)", + language, + filter_stopwords, + use_ngrams, + ) self.language = language self.filter_stopwords = filter_stopwords self.unigram_normalizer = unigram_normalizer @@ -58,6 +66,7 @@ def filter_tokens( ] def analyze_texts(self): + logger.info(f"Analyzing {len(self.texts)} texts...") porter = PorterStemmer() for text in self.texts: doc = self.nlp(text) @@ -67,8 +76,8 @@ def analyze_texts(self): for sentence in doc.sents: filtered_tokens = self.filter_tokens( - list(sentence), - self.filter_stopwords + list(sentence), + self.filter_stopwords ) normalized_tokens = [ self.normalize_token(t, porter) for t in filtered_tokens @@ -93,6 +102,10 @@ def analyze_texts(self): if ngram_list: self.ngram_frequency.update(ngram_list) self.ngram_document_frequency.update(set(ngram_list)) + logger.info( + f"Finished analyzing texts: {self.token_frequency} unigrams, { + self.ngram_frequency} n-grams", + ) def normalize_token( self, @@ -110,6 +123,7 @@ def normalize_token( return word def generate_bag_of_words(self): + logger.info("Generating bag-of-words...") porter = PorterStemmer() self.bag_of_words = [] @@ -177,7 +191,7 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict): dtm (np.ndarray): shape = (num_docs, num_terms) vocab (dict): mapping term -> column index """ - + logger.info("Building document-term-matrix...") all_terms = set() for doc in self.bag_of_words: for t in doc: @@ -194,4 +208,5 @@ def generate_document_term_matrix(self) -> (np.ndarray, dict): term_idx = vocab[token["term"]] dtm[doc_idx, term_idx] += 1 + logger.info(f"Matrix shape: {dtm.shape} | Vocab size: {len(vocab)}") return dtm, vocab diff --git a/preprocessing/loader.py b/preprocessing/loader.py index 9ff51c6..50d0177 100644 --- a/preprocessing/loader.py +++ b/preprocessing/loader.py @@ -1,6 +1,9 @@ +import logging import re import bibtexparser +logger = logging.getLogger(__name__) + def normalize_text(text: str) -> str: if not text: @@ -24,6 +27,7 @@ def normalize_text(text: str) -> str: class TxtLoader: @staticmethod def load(file_path: str) -> list[str]: + 
logger.info("Loading TXT file...") with open(file_path, "r", encoding="utf-8") as f: lines = f.readlines() return [normalize_text(line) for line in lines] @@ -32,6 +36,7 @@ def load(file_path: str) -> list[str]: class BibLoader: @staticmethod def load(file_path: str, attribute: str) -> list[str]: + logger.info(f"Loading BIB file (attribute={attribute})...") with open(file_path, "r", encoding="utf-8") as f: bib_database = bibtexparser.load(f) From 8868ef1cf9a6d82c61f384d3767b77062fe84f24 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:46:47 -0600 Subject: [PATCH 5/7] fix: use absolute path for file download --- .github/workflows/ci.yaml | 2 +- cbc.yaml | 12 ++-- input.bib | 112 -------------------------------------- input.txt | 4 -- main.py | 13 +++-- 5 files changed, 16 insertions(+), 127 deletions(-) delete mode 100644 input.bib delete mode 100644 input.txt diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 786a9ed..60e9630 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -38,7 +38,7 @@ jobs: - name: Intall dependencies run: | pip install -r requirements.txt - + - name: Check cbcs run: | python3 - <<'EOF' diff --git a/cbc.yaml b/cbc.yaml index 0932961..a24e387 100644 --- a/cbc.yaml +++ b/cbc.yaml @@ -1,10 +1,11 @@ author: Paul Kalhorn description: Language preprocessing for .txt or .bib files -docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing +docker_image: ghcr.io/rwth-time/language-preprocessing/language-preprocessing entrypoints: preprocess_bib_file: - description: Entrypoint for preprocessing a .bib file + description: Entrypoint for preprocessing a .bib file envs: + BIB_DOWNLOAD_PATH: /tmp/input.bib FILTER_STOPWORDS: true LANGUAGE: en NGRAM_MAX: 3 @@ -23,7 +24,7 @@ entrypoints: bib_file_S3_PORT: null bib_file_S3_SECRET_KEY: null bib_file_SELECTED_ATTRIBUTE: Abstract - description: The bib file, aswell as one attribute selected for preprocessing + description: The bib file, aswell as one attribute selected for preprocessing type: file outputs: dtm_output: @@ -36,7 +37,7 @@ entrypoints: dtm_output_S3_HOST: null dtm_output_S3_PORT: null dtm_output_S3_SECRET_KEY: null - description: Numpy representation of document-term matrix as .pkl file + description: Numpy representation of document-term matrix as .pkl file type: file vocab_output: config: @@ -57,6 +58,7 @@ entrypoints: LANGUAGE: en NGRAM_MAX: 3 NGRAM_MIN: 2 + TXT_DOWNLOAD_PATH: /tmp/input.txt UNIGRAM_NORMALIZER: porter USE_NGRAMS: true inputs: @@ -70,7 +72,7 @@ entrypoints: txt_file_S3_HOST: null txt_file_S3_PORT: null txt_file_S3_SECRET_KEY: null - description: A .txt file + description: A .txt file type: file outputs: dtm_output: diff --git a/input.bib b/input.bib deleted file mode 100644 index f525305..0000000 --- a/input.bib +++ /dev/null @@ -1,112 +0,0 @@ - -@article{ WOS:001016714700004, -Author = {White, Joel}, -Title = {Theoretical and Practical Paralogisms of Digital Immortality}, -Journal = {JOURNAL OF AESTHETICS AND PHENOMENOLOGY}, -Year = {2022}, -Volume = {9}, -Number = {2, SI}, -Pages = {155-172}, -Month = {JUL 3}, -Abstract = {Modern and contemporary transhumanism has seen a recent rise in academic - and popular relevance; specific naive metaphysical ideas, such as - immortality, have returned with this rise. This article refrains from - any ethical or political assessment of transhumanism. 
Still, it - critiques the exact metaphysical or idealistic nature of transhumanism - and its pursuit of digital immortality: the idea that, through - technological advancements, precisely in Artificial General - Intelligence, an immortal virtual ``self{''} will become possible. The - article follows the form of Immanuel Kant's ``Paralogisms{''} from the - Critique of Pure Reason, where Kant is concerned with the substantial, - immortal nature of the soul and its experiential impossibility. The - article will offer theoretical and practical paralogisms (false logical - inferences), arguing that the transhumanist claim that digital - immortality is possible fundamentally stems from two incorrect major - premises. The first concerns the substantial nature of information, - which informs the theoretical paralogisms; the second concerns infinite - transformation (pure plasticity), which informs the practical - paralogisms}, -Publisher = {ROUTLEDGE JOURNALS, TAYLOR \& FRANCIS LTD}, -Address = {2-4 PARK SQUARE, MILTON PARK, ABINGDON OX14 4RN, OXON, ENGLAND}, -Type = {Article}, -Language = {English}, -DOI = {10.1080/20539320.2022.2150463}, -ISSN = {2053-9320}, -EISSN = {2053-9339}, -Keywords = {Transhumanism; Critical Philosophy; Immanuel Kant; Entropy; Paralogisms; - Digital Immortality}, -Research-Areas = {Philosophy}, -Web-of-Science-Categories = {Philosophy}, -Author-Email = {jhmw01@gmail.com}, -ORCID-Numbers = {White, Joel/0000-0001-6460-0564}, -Number-of-Cited-References = {30}, -Times-Cited = {0}, -Usage-Count-Last-180-days = {3}, -Usage-Count-Since-2013 = {15}, -Journal-ISO = {J. Aesthet. Phenomenol.}, -Doc-Delivery-Number = {K5GF0}, -Web-of-Science-Index = {Emerging Sources Citation Index (ESCI)}, -Unique-ID = {WOS:001016714700004}, -DA = {2025-06-26}, -} - -@article{ WOS:001322577100012, -Author = {Kant, Vivek and Khanganba, Sanjram Premjit and Dixit, Sudhir}, -Title = {Sociopolitical Challenges to Digital Transformation of Rural - Communities: Learnings from a Case Study From Manipur, India}, -Journal = {IT PROFESSIONAL}, -Year = {2024}, -Volume = {26}, -Number = {4}, -Pages = {42-47}, -Month = {JUL-AUG}, -Abstract = {The United Nations Panel on Digital Cooperation, 2019, has emphasized - the inclusive growth of digital networks and digital public goods, - utilizing a multistakeholder systems approach. Similarly, the - information and communications technology (ICT) Innovation and - Intervention Program of the Government of India's Digital North East - Vision 2022 has also emphasized a need for inclusive growth of ICT in - the Northeast Region. In line with the above, this article presents - insights from a field study conducted in the rural parts of Manipur, - India, which incidentally can be found to be applicable to many rural - parts of the developing world. The article envisions a community-driven - sociodigital transformation of the Northeast Region of India. In this - quest, the article highlights sociopolitical challenges for digital - transformation and provides insights for inclusive ICT in such - regions-infrastructure as a utility for every citizen, smart governance - and services on demand, digital empowerment of citizens, social welfare, - capacity building, and community engagement.}, -Publisher = {IEEE COMPUTER SOC}, -Address = {10662 LOS VAQUEROS CIRCLE, PO BOX 3014, LOS ALAMITOS, CA 90720-1314 USA}, -Type = {Article}, -Language = {English}, -Affiliation = {Kant, V (Corresponding Author), Indian Inst Technol Kanpur, Kanpur 208016, India. 
- Kant, Vivek, Indian Inst Technol Kanpur, Kanpur 208016, India. - Khanganba, Sanjram Premjit, Indian Inst Technol Indore, Indore 452020, India. - Dixit, Sudhir, Basic Internet Fdn, Oslo, Norway.}, -DOI = {10.1109/MITP.2024.3433459}, -ISSN = {1520-9202}, -EISSN = {1941-045X}, -Keywords = {Technological innovation; Digital transformation; Government; Buildings; - Asia; Africa; Information and communication technology}, -Research-Areas = {Computer Science; Telecommunications}, -Web-of-Science-Categories = {Computer Science, Information Systems; Computer Science, Software - Engineering; Telecommunications}, -Author-Email = {vkant@iitk.ac.in - sanjrampk@iiti.ac.in - sudhir.dixit@ieee.org}, -Affiliations = {Indian Institute of Technology System (IIT System); Indian Institute of - Technology (IIT) - Kanpur; Indian Institute of Technology System (IIT - System); Indian Institute of Technology (IIT) - Indore}, -ResearcherID-Numbers = {/ITU-6308-2023}, -ORCID-Numbers = {/0000-0002-6215-7500}, -Number-of-Cited-References = {7}, -Times-Cited = {0}, -Usage-Count-Last-180-days = {11}, -Usage-Count-Since-2013 = {22}, -Journal-ISO = {IT Prof.}, -Doc-Delivery-Number = {H3O9D}, -Web-of-Science-Index = {Science Citation Index Expanded (SCI-EXPANDED)}, -Unique-ID = {WOS:001322577100012}, -DA = {2025-06-26}, -} diff --git a/input.txt b/input.txt deleted file mode 100644 index 1755b0f..0000000 --- a/input.txt +++ /dev/null @@ -1,4 +0,0 @@ -Cats chase mice. Dogs chase cats. -Birds fly high. Cats and dogs coexist. -Mice hide from cats. Birds sing loudly. -Cats and dogs coexist. Cats and dogs coexist. diff --git a/main.py b/main.py index 64b9e21..aa03bf3 100644 --- a/main.py +++ b/main.py @@ -14,7 +14,6 @@ from preprocessing.core import Preprocessor from preprocessing.loader import TxtLoader, BibLoader - logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' @@ -54,6 +53,8 @@ class PreprocessTXT(EnvSettings): NGRAM_MIN: int = 2 NGRAM_MAX: int = 3 + TXT_DOWNLOAD_PATH: str = "/tmp/input.txt" + txt_input: TXTFileInput dtm_output: DTMFileOutput vocab_output: VocabFileOutput @@ -67,6 +68,8 @@ class PreprocessBIB(EnvSettings): NGRAM_MIN: int = 2 NGRAM_MAX: int = 3 + BIB_DOWNLOAD_PATH: str = "/tmp/input.bib" + bib_input: BIBFileInput dtm_output: DTMFileOutput vocab_output: VocabFileOutput @@ -113,9 +116,9 @@ def _preprocess_and_store(texts, settings): @entrypoint(PreprocessTXT) def preprocess_txt_file(settings): logger.info("Downloading TXT input from S3...") - S3Operations.download(settings.txt_input, "input.txt") + S3Operations.download(settings.txt_input, settings.TXT_DOWNLOAD_PATH) - texts = TxtLoader.load("./input.txt") + texts = TxtLoader.load(settings.TXT_DOWNLOAD_PATH) _preprocess_and_store(texts, settings) @@ -123,10 +126,10 @@ def preprocess_txt_file(settings): @entrypoint(PreprocessBIB) def preprocess_bib_file(settings): logger.info("Downloading BIB input from S3...") - S3Operations.download(settings.bib_input, "input.bib") + S3Operations.download(settings.bib_input, settings.BIB_DOWNLOAD_PATH) texts = BibLoader.load( - "./input.bib", + settings.BIB_DOWNLOAD_PATH, attribute=settings.bib_input.SELECTED_ATTRIBUTE, ) _preprocess_and_store(texts, settings) From d159e80d9ab57156661356b370fe7180c223a8a4 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:48:38 -0600 Subject: [PATCH 6/7] fix: use lemma in cbc --- cbc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cbc.yaml b/cbc.yaml index a24e387..d20bcdf 100644 --- a/cbc.yaml 
+++ b/cbc.yaml @@ -10,7 +10,7 @@ entrypoints: LANGUAGE: en NGRAM_MAX: 3 NGRAM_MIN: 2 - UNIGRAM_NORMALIZER: porter + UNIGRAM_NORMALIZER: lemma USE_NGRAMS: true inputs: bib_input: @@ -59,7 +59,7 @@ entrypoints: NGRAM_MAX: 3 NGRAM_MIN: 2 TXT_DOWNLOAD_PATH: /tmp/input.txt - UNIGRAM_NORMALIZER: porter + UNIGRAM_NORMALIZER: lemma USE_NGRAMS: true inputs: txt_input: From 7994335a7eca8eddea66b59a92a2e51e846d8a7b Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 10 Dec 2025 16:57:58 -0600 Subject: [PATCH 7/7] fix: use porter in tests --- test/test_full.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_full.py b/test/test_full.py index c12be20..c97b370 100644 --- a/test/test_full.py +++ b/test/test_full.py @@ -57,6 +57,8 @@ def test_full_bib(s3_minio): ) env = { + "UNIGRAM_NORMALIZER": "porter", + "bib_file_S3_HOST": "http://127.0.0.1", "bib_file_S3_PORT": "9000", "bib_file_S3_ACCESS_KEY": MINIO_USER, @@ -140,6 +142,8 @@ def test_full_txt(s3_minio): ) env = { + "UNIGRAM_NORMALIZER": "porter", + "txt_file_S3_HOST": "http://127.0.0.1", "txt_file_S3_PORT": "9000", "txt_file_S3_ACCESS_KEY": MINIO_USER,
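
Note (not part of the patch series): patch 6 switches the cbc.yaml default
UNIGRAM_NORMALIZER from "porter" to "lemma", and patch 7 pins the integration
tests back to "porter" — presumably because the expected_*.pkl fixtures under
test/files/ were generated with the Porter stemmer. Below is a minimal sketch
of why the two normalizer modes yield different vocabularies (and therefore
different expected DTM/vocab pickles), assuming the normalize_token semantics
shown in preprocessing/core.py; the example sentence is illustrative, not from
the repo.

    import spacy
    from nltk.stem import PorterStemmer

    # Assumes the en_core_web_sm model is installed
    # (python -m spacy download en_core_web_sm).
    nlp = spacy.load("en_core_web_sm")
    porter = PorterStemmer()

    for token in nlp("Cats chase mice while studies are running."):
        # "porter" mode stems the surface form; "lemma" mode uses spaCy's lemma.
        print(token.text, porter.stem(token.text.lower()), token.lemma_.lower())

    # e.g. "studies" -> porter "studi" vs lemma "study"; the resulting
    # vocabularies differ, so fixtures generated under one normalizer
    # will not match output produced under the other.

Pinning "UNIGRAM_NORMALIZER": "porter" in the test env (patch 7) keeps the
full-pipeline tests deterministic against the checked-in fixtures while the
shipped cbc.yaml default remains "lemma".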