Skip to content

Commit d83f167

Browse files
categorical check
1 parent f4119eb commit d83f167

File tree

4 files changed

+44
-14
lines changed

4 files changed

+44
-14
lines changed

python/src/lazylearn/ingestion/ingestion_pipeline.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
from ingestion.ingestion_pipeline_steps.interpreter_step import ( # noqa
33
ColumnTypeInterpreter,
44
)
5-
from ingestion.ingestion_pipeline_steps.summary_stats_step import SummaryStatistics
5+
from ingestion.ingestion_pipeline_steps.summary_stats_step import ( # noqa
6+
SummaryStatistics,
7+
)
68
from pipeline.pipeline import IngestionPipeline
79

810

python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,48 @@ def apply(self, pipeline: IngestionPipeline):
2323
pipeline.column_type_map = column_types
2424

2525
def analyze_column(self, column: Series):
26-
# is it numeric?
27-
values = set(column)
28-
types = set([type(value) for value in values])
26+
"""
27+
28+
:param column:
29+
:return:
30+
"""
31+
values = column.tolist()
32+
types = [type(value) for value in values]
33+
34+
if self.categorical_test(values):
35+
return "categorical"
2936

30-
if self.numeric_test(types):
37+
elif self.numeric_test(types):
3138
return "numeric"
3239

3340
return "object"
3441

3542
@staticmethod
36-
def numeric_test(types: set):
37-
return all([item == float or item == int for item in types])
43+
def categorical_test(values: list):
    """
    Tests whether a column is of categorical type.

    A column is treated as categorical when the number of unique
    values is less than 5% of the total number of values.

    :param values: list of hashable values of any type
    :return: True if attr is categorical, False otherwise
        (an empty list is never categorical)
    """
    n_total = len(values)
    if n_total == 0:
        # No values means there is no ratio to compute; returning here
        # avoids a ZeroDivisionError below.
        return False

    n_unique = len(set(values))
    return n_unique / n_total < 0.05
59+
60+
@staticmethod
61+
def numeric_test(types: list):
    """
    Tests whether every type in a collection is exactly float or int.

    The comparison is against the type objects themselves, so
    subclasses such as bool do not count as numeric. An empty
    collection yields True because there is no type to reject.

    :param types: list of type objects, one per value in the column
    :return: True if all distinct types are float or int,
        False otherwise
    """
    # Deduplicate first so each distinct type is checked only once.
    return all(item in (float, int) for item in set(types))
3868

3969
@staticmethod
4070
def string_test(types: set):
@@ -43,7 +73,3 @@ def string_test(types: set):
4373
@staticmethod
4474
def date_check(types: set):
4575
raise NotImplementedError
46-
47-
@staticmethod
48-
def categorical_test(values: set):
49-
raise NotImplementedError

python/src/lazylearn/ingestion/ingestion_pipeline_steps/summary_stats_step.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ def apply(self, pipeline: IngestionPipeline):
1515
for column in pipeline.column_type_map
1616
if pipeline.column_type_map[column] == "numeric"
1717
]
18-
df = pipeline.df
1918

2019
for attr in numeric_attributes:
21-
pipeline.summary_stats[attr] = pipeline.df[attr].describe().to_dict()
20+
pipeline.summary_stats[attr] = (
21+
pipeline.df[attr].describe().to_dict()
22+
) # noqa

python/src/test/ingestion/ingestion_pipeline_steps/test_summary_stats_step.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pandas as pd
12
from ingestion.ingestion_pipeline_steps.summary_stats_step import ( # noqa
23
SummaryStatistics,
34
)
@@ -7,7 +8,7 @@
78

89
def test_iris_stats():
910
pipeline = IngestionPipeline()
10-
pipeline.df = load_iris(return_X_y=True, as_frame=True)[0]
11+
pipeline.df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis=1)
1112
pipeline.column_type_map = {
1213
"sepal length (cm)": "numeric",
1314
"sepal width (cm)": "numeric",

0 commit comments

Comments
 (0)