Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cbc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ entrypoints:
MAX_ITER: 10
N_TOPICS: 5
N_TOP_WORDS: 10
DTM_DOWNLOAD_PATH: /tmp/dtm.pkl
VOCAB_DOWNLOAD_PATH: /tmp/vocab.pkl
inputs:
dtm:
config:
Expand Down
Binary file removed dtm.pkl
Binary file not shown.
11 changes: 7 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class LDATopicModeling(EnvSettings):
LEARNING_METHOD: str = "batch"
N_TOP_WORDS: int = 10

DTM_DOWNLOAD_PATH: str = "/tmp/dtm.pkl"
VOCAB_DOWNLOAD_PATH: str = "/tmp/vocab.pkl"

vocab: VocabFileInput
dtm: DTMFileInput

Expand All @@ -70,19 +73,19 @@ def lda_topic_modeling(settings):
logger.info("Starting LDA topic modeling pipeline…")

logger.info("Downloading vocabulary file...")
S3Operations.download(settings.vocab, "vocab.pkl")
S3Operations.download(settings.vocab, settings.VOCAB_DOWNLOAD_PATH)

logger.info("Loading vocab.pkl from disk...")
with open("vocab.pkl", "rb") as f:
with open(settings.VOCAB_DOWNLOAD_PATH, "rb") as f:
vocab = pickle.load(f)

logger.info(f"Loaded vocab with {len(vocab)} terms.")

logger.info("Downloading DTM file...")
S3Operations.download(settings.dtm, "dtm.pkl")
S3Operations.download(settings.dtm, settings.DTM_DOWNLOAD_PATH)

logger.info("Loading dtm.pkl from disk...")
with open("dtm.pkl", "rb") as f:
with open(settings.DTM_DOWNLOAD_PATH, "rb") as f:
dtm = pickle.load(f)

logger.info(f"Loaded DTM with shape {dtm.shape}")
Expand Down
Binary file removed vocab.pkl
Binary file not shown.
Loading