categorical check

frederikhoengaard · frederikhoengaard · commit d83f16701fd9 · 2023-05-05T22:55:27.000+02:00
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline.py b/python/src/lazylearn/ingestion/ingestion_pipeline.py
@@ -2,7 +2,9 @@
 from ingestion.ingestion_pipeline_steps.interpreter_step import (  # noqa
     ColumnTypeInterpreter,
 )
-from ingestion.ingestion_pipeline_steps.summary_stats_step import SummaryStatistics
+from ingestion.ingestion_pipeline_steps.summary_stats_step import (  # noqa
+    SummaryStatistics,
+)
 from pipeline.pipeline import IngestionPipeline
 
 
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py
@@ -23,18 +23,48 @@ def apply(self, pipeline: IngestionPipeline):
         pipeline.column_type_map = column_types
 
     def analyze_column(self, column: Series):
-        # is it numeric?
-        values = set(column)
-        types = set([type(value) for value in values])
+        """
+        Labels a column as "categorical", "numeric" or "object".
+        :param column: Series holding the values of a single column
+        :return: the label; the categorical test takes precedence
+        """
+        values = column.tolist()
+        types = [type(value) for value in values]
+
+        if self.categorical_test(values):
+            return "categorical"
 
-        if self.numeric_test(types):
+        elif self.numeric_test(types):
             return "numeric"
 
         return "object"
 
     @staticmethod
-    def numeric_test(types: set):
-        return all([item == float or item == int for item in types])
+    def categorical_test(values: list):
+        """
+        Tests whether a column is of categorical type.
+        A column counts as categorical when its number of unique values
+        is less than 5% of its total number of values.
+        An empty column is never categorical.
+
+        :param values: list of hashable values of any type
+        :return: True if attr is categorical, False otherwise
+        """
+        if not values:
+            # an empty column would otherwise raise ZeroDivisionError
+            return False
+
+        unique_fraction = len(set(values)) / len(values)
+        return unique_fraction < 0.05
+
+    @staticmethod
+    def numeric_test(types: list):
+        """
+        Tests whether every type in the given list is float or int.
+        :param types: list of the types of the values in a column
+        :return: True if all types are float or int, False otherwise
+        """
+        return all(kind in (float, int) for kind in set(types))
 
     @staticmethod
     def string_test(types: set):
@@ -43,7 +73,3 @@ def string_test(types: set):
     @staticmethod
     def date_check(types: set):
         raise NotImplementedError
-
-    @staticmethod
-    def categorical_test(values: set):
-        raise NotImplementedError
diff --git a/python/src/lazylearn/ingestion/ingestion_pipeline_steps/summary_stats_step.py b/python/src/lazylearn/ingestion/ingestion_pipeline_steps/summary_stats_step.py
@@ -15,7 +15,8 @@ def apply(self, pipeline: IngestionPipeline):
             for column in pipeline.column_type_map
             if pipeline.column_type_map[column] == "numeric"
         ]
-        df = pipeline.df
 
         for attr in numeric_attributes:
-            pipeline.summary_stats[attr] = pipeline.df[attr].describe().to_dict()
+            pipeline.summary_stats[attr] = (
+                pipeline.df[attr].describe().to_dict()
+            )  # noqa
diff --git a/python/src/test/ingestion/ingestion_pipeline_steps/test_summary_stats_step.py b/python/src/test/ingestion/ingestion_pipeline_steps/test_summary_stats_step.py
@@ -1,3 +1,4 @@
+import pandas as pd
 from ingestion.ingestion_pipeline_steps.summary_stats_step import (  # noqa
     SummaryStatistics,
 )
@@ -7,7 +8,7 @@
 
 def test_iris_stats():
     pipeline = IngestionPipeline()
-    pipeline.df = load_iris(return_X_y=True, as_frame=True)[0]
+    pipeline.df = pd.concat(load_iris(return_X_y=True, as_frame=True), axis=1)
     pipeline.column_type_map = {
         "sepal length (cm)": "numeric",
         "sepal width (cm)": "numeric",

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,9 @@`
`2`	`2`	`from ingestion.ingestion_pipeline_steps.interpreter_step import ( # noqa`
`3`	`3`	`ColumnTypeInterpreter,`
`4`	`4`	`)`
`5`		`-from ingestion.ingestion_pipeline_steps.summary_stats_step import SummaryStatistics`
	`5`	`+from ingestion.ingestion_pipeline_steps.summary_stats_step import ( # noqa`
	`6`	`+ SummaryStatistics,`
	`7`	`+)`
`6`	`8`	`from pipeline.pipeline import IngestionPipeline`
`7`	`9`
`8`	`10`