Skip to content

Commit 0464761

Browse files
ColumnTypeInterpreter understands numeric
1 parent 3c08a24 commit 0464761

File tree

6 files changed

+73
-3
lines changed

6 files changed

+73
-3
lines changed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ verify_ssl = true
77
loguru = "==0.6.*"
88
pandas = "==1.5.*"
99
scikit-learn = "*"
10+
tqdm = "*"
1011

1112
[dev-packages]
1213
black = "==23.*"

Pipfile.lock

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

python/src/lazylearn/ingestion/ingestion_pipeline.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
from pipeline.pipeline import IngestionPipeline, PipelineStep
1+
from ingestion.ingestion_pipeline_steps.data_parser_step import DataSourceParser # noqa
2+
from ingestion.ingestion_pipeline_steps.interpreter_step import ColumnTypeInterpreter # noqa
3+
from pipeline.pipeline import IngestionPipeline
24

35

46
class Ingestion:
@@ -11,7 +13,7 @@ def run(self, data):
1113

1214
pipeline.add(DataSourceParser(data))
1315

14-
pipeline.add(ColumnInterpreter())
16+
pipeline.add(ColumnTypeInterpreter())
1517

1618
pipeline.run()
1719

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from pandas import Series
2+
from pipeline.pipeline import IngestionPipeline
3+
from tqdm import tqdm
4+
5+
6+
class ColumnTypeInterpreter:
    """Ingestion pipeline step that labels each DataFrame column with a coarse type.

    Currently the only labels produced are ``"numeric"`` and ``"object"``;
    the string and date checks are placeholders that are not implemented yet.
    """

    def apply(self, pipeline: "IngestionPipeline"):
        """Classify every column of ``pipeline.df`` and store the result.

        :param pipeline: parent IngestionPipeline; its ``df`` attribute is read
            and its ``column_type_map`` attribute is overwritten with a dict
            mapping each column name to its type label
        :return: None
        """
        column_types = {}

        # tqdm only wraps the iteration to show progress over the columns.
        for column_name in tqdm(pipeline.df.columns):
            column_types[column_name] = self.analyze_column(
                pipeline.df[column_name]
            )

        pipeline.column_type_map = column_types

    def analyze_column(self, column: Series):
        """Return the type label for a single column.

        :param column: the column values to inspect
        :return: ``"numeric"`` when every value is an ``int`` or ``float``,
            otherwise ``"object"``
        """
        # Collect the element types directly from the values. Building
        # set(column) first would hash every value (raising TypeError on
        # unhashable cells such as lists) and would merge equal values of
        # different types (1 == True), making the result order-dependent.
        types = {type(value) for value in column}

        if self.numeric_test(types):
            return "numeric"

        return "object"

    @staticmethod
    def numeric_test(types: set):
        """Return True when every type in ``types`` is exactly ``int`` or ``float``.

        Subclasses such as ``bool`` do not count as numeric. An empty set
        yields True, so a column without values is reported as numeric.

        :param types: set of Python types found in a column
        :return: bool
        """
        return all(item in (int, float) for item in types)

    @staticmethod
    def string_test(types: set):
        """Placeholder for string detection; not implemented yet.

        :param types: set of Python types found in a column
        :raises NotImplementedError: always
        """
        raise NotImplementedError

    @staticmethod
    def date_check(types: set):
        """Placeholder for date detection; not implemented yet.

        :param types: set of Python types found in a column
        :raises NotImplementedError: always
        """
        raise NotImplementedError

python/src/lazylearn/pipeline/pipeline.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def __init__(self):
2727
super().__init__()
2828
self.raw_data = None
2929
self.df: DataFrame = None
30+
self.column_type_map: dict = None
3031

3132
def response(self):
3233
return Dataset
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from ingestion.ingestion_pipeline_steps.interpreter_step import ColumnTypeInterpreter
2+
from pipeline.pipeline import IngestionPipeline
3+
from sklearn.datasets import load_iris
4+
5+
6+
def test_iris_okay():
    """All four iris feature columns should be interpreted as numeric."""
    # load_iris(return_X_y=True, as_frame=True) returns (features, target);
    # only the feature frame is needed here.
    iris_features = load_iris(return_X_y=True, as_frame=True)[0]

    pipeline = IngestionPipeline()
    pipeline.df = iris_features
    pipeline.add(ColumnTypeInterpreter())
    pipeline.run()

    feature_names = (
        "sepal length (cm)",
        "sepal width (cm)",
        "petal length (cm)",
        "petal width (cm)",
    )
    expected = {name: "numeric" for name in feature_names}

    assert pipeline.column_type_map == expected

0 commit comments

Comments
 (0)