Implements a first version of sort_values. (#24)

sdpython · web-flow · commit a2e15ddd82d5 · 2021-10-18T10:58:08.000+02:00
* Implements a first version of sort_values.
* fix sorting with nan values
diff --git a/_unittests/ut_df/test_dataframe_sort.py b/_unittests/ut_df/test_dataframe_sort.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+"""
+@brief      test log(time=4s)
+"""
+import os
+import unittest
+import pandas
+from pyquickhelper.pycode import ExtTestCase, get_temp_folder
+from pandas_streaming.df import StreamingDataFrame
+
+
+class TestDataFrameSort(ExtTestCase):
+
+    def test_sort_values(self):
+        temp = get_temp_folder(__file__, "temp_sort_values")
+        name = os.path.join(temp, "_data_")
+        df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
+                               dict(a=5, b="f", c=5.7, ind="a2", ai=2),
+                               dict(a=4, b="g", ind="a3", ai=3),
+                               dict(a=8, b="h", c=5.9, ai=4),
+                               dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
+        sdf = StreamingDataFrame.read_df(df, chunksize=2)
+        sorted_df = df.sort_values(by="a")
+        res = sdf.sort_values(by="a", temp_file=name)
+        res_df = res.to_df()
+        self.assertEqualDataFrame(sorted_df, res_df)
+
+    def test_sort_values_twice(self):
+        temp = get_temp_folder(__file__, "temp_sort_values_twice")
+        name = os.path.join(temp, "_data_")
+        df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
+                               dict(a=5, b="f", c=5.7, ind="a2", ai=2),
+                               dict(a=4, b="g", ind="a3", ai=3),
+                               dict(a=8, b="h", c=5.9, ai=4),
+                               dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
+        sdf = StreamingDataFrame.read_df(df, chunksize=2)
+        sorted_df = df.sort_values(by="a")
+        res = sdf.sort_values(by="a", temp_file=name)
+        res_df = res.to_df()
+        self.assertEqualDataFrame(sorted_df, res_df)
+        res_df = res.to_df()
+        self.assertEqualDataFrame(sorted_df, res_df)
+
+    def test_sort_values_reverse(self):
+        temp = get_temp_folder(__file__, "temp_sort_values_reverse")
+        name = os.path.join(temp, "_data_")
+        df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
+                               dict(a=5, b="f", c=5.7, ind="a2", ai=2),
+                               dict(a=4, b="g", ind="a3", ai=3),
+                               dict(a=8, b="h", c=5.9, ai=4),
+                               dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
+        sdf = StreamingDataFrame.read_df(df, chunksize=2)
+        sorted_df = df.sort_values(by="a", ascending=False)
+        res = sdf.sort_values(by="a", temp_file=name, ascending=False)
+        res_df = res.to_df()
+        self.assertEqualDataFrame(sorted_df, res_df)
+
+    def test_sort_values_nan_last(self):
+        temp = get_temp_folder(__file__, "temp_sort_values_nan_last")
+        name = os.path.join(temp, "_data_")
+        df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
+                               dict(b="f", c=5.7, ind="a2", ai=2),
+                               dict(b="f", c=5.8, ind="a2", ai=2),
+                               dict(a=4, b="g", ind="a3", ai=3),
+                               dict(a=8, b="h", c=5.9, ai=4),
+                               dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
+        sdf = StreamingDataFrame.read_df(df, chunksize=2)
+        sorted_df = df.sort_values(by="a", na_position='last')
+        res = sdf.sort_values(by="a", temp_file=name, na_position='last')
+        res_df = res.to_df()
+        self.assertEqualDataFrame(sorted_df, res_df)
+
+    def test_sort_values_nan_first(self):
+        temp = get_temp_folder(__file__, "temp_sort_values_nan_first")
+        name = os.path.join(temp, "_data_")
+        df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1),
+                               dict(b="f", c=5.7, ind="a2", ai=2),
+                               dict(b="f", c=5.8, ind="a2", ai=2),
+                               dict(a=4, b="g", ind="a3", ai=3),
+                               dict(a=8, b="h", c=5.9, ai=4),
+                               dict(a=16, b="i", c=6.2, ind="a5", ai=5)])
+        sdf = StreamingDataFrame.read_df(df, chunksize=2)
+        sorted_df = df.sort_values(by="a", na_position='first')
+        res = sdf.sort_values(by="a", temp_file=name, na_position='first')
+        res_df = res.to_df()
+        self.assertEqualDataFrame(sorted_df, res_df)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/_unittests/ut_df/test_streaming_dataframe.py b/_unittests/ut_df/test_streaming_dataframe.py
@@ -9,7 +9,6 @@
 import numpy
 from pyquickhelper.pycode import ExtTestCase, get_temp_folder
 from pandas_streaming.data import dummy_streaming_dataframe
-from pandas_streaming.exc import StreamingInefficientException
 from pandas_streaming.df import StreamingDataFrame
 from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError
 
@@ -23,8 +22,6 @@ def test_shape(self):
         self.assertEqual(len(dfs), 10)
         shape = sdf.shape
         self.assertEqual(shape, (100, 2))
-        self.assertRaise(lambda: sdf.sort_values(
-            "r"), StreamingInefficientException)
 
     def test_init(self):
         sdf = dummy_streaming_dataframe(100)
@@ -557,5 +554,4 @@ def test_set_item_function(self):
 
 
 if __name__ == "__main__":
-    # TestStreamingDataFrame().test_describe()
     unittest.main()
diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py
@@ -3,14 +3,15 @@
 @file
 @brief Defines a streaming dataframe.
 """
+import pickle
+import os
 from io import StringIO, BytesIO
 from inspect import isfunction
 import numpy
 import numpy.random as nrandom
 import pandas
 from pandas.testing import assert_frame_equal
 from pandas.io.json import json_normalize
-from ..exc import StreamingInefficientException
 from .dataframe_split import sklearn_train_test_split, sklearn_train_test_split_streaming
 from .dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream
 
@@ -67,6 +68,7 @@ class StreamingDataFrame:
     """
 
     def __init__(self, iter_creation, check_schema=True, stable=True):
+        self._delete_ = []
         if isinstance(iter_creation, (pandas.DataFrame, dict,
                                       numpy.ndarray, str)):
             raise TypeError(
@@ -407,12 +409,6 @@ def __iter__(self):
             rows += it.shape[0]
             yield it
 
-    def sort_values(self, *args, **kwargs):
-        """
-        Not implemented.
-        """
-        raise StreamingInefficientException(StreamingDataFrame.sort_values)
-
     @property
     def shape(self):
         """
@@ -1118,6 +1114,93 @@ def describe(self, percentiles=None, include=None, exclude=None,
         summary = summary.loc[rows, :]
         return pandas.concat([merged, summary])
 
+    def sort_values(self, by, axis=0, ascending=True, kind='quicksort',
+                    na_position='last',
+                    temp_file='_pandas_streaming_sort_values_'):
+        """
+        Sorts the streaming dataframe by values.
+
+        :param by: one column
+        :param ascending: order
+        :param kind: see :meth:`pandas.DataFrame.sort_values`
+        :param na_position: see :meth:`pandas.DataFrame.sort_values`
+        :param temp_file: sorting a whole database is impossible
+            without storing intermediate results on disk
+            unless it can fit into the memory, but in that case,
+            it is easier to convert the streaming database into
+            a dataframe and sort it
+        :return: streaming database
+        """
+        if not isinstance(by, str):
+            raise NotImplementedError(
+                "Only one column can be used to sort not %r." % by)
+        keys = {}
+        nans = []
+        indices = []
+        with open(temp_file, 'wb') as f:
+            for df in self:
+                dfs = df.sort_values(by, ascending=ascending, kind=kind,
+                                     na_position=na_position)
+                for tu in dfs[by]:
+                    if isinstance(tu, float) and numpy.isnan(tu):
+                        nans.append(len(indices))
+                    else:
+                        if tu not in keys:
+                            keys[tu] = []
+                        keys[tu].append(len(indices))
+                indices.append(f.tell())
+                st = BytesIO()
+                pickle.dump(dfs, st)
+                f.write(st.getvalue())
+
+            indices.append(f.tell())
+
+        values = list(keys.items())
+        values.sort(reverse=not ascending)
+
+        def iterate():
+
+            with open(temp_file, 'rb') as f:
+
+                if na_position == 'first':
+                    for p in nans:
+                        f.seek(indices[p])
+                        length = indices[p + 1] - indices[p]
+                        pkl = f.read(length)
+                        dfs = pickle.load(BytesIO(pkl))
+                        sub = dfs[numpy.isnan(dfs[by])]
+                        yield sub
+
+                for key, positions in values:
+                    for p in positions:
+                        f.seek(indices[p])
+                        length = indices[p + 1] - indices[p]
+                        pkl = f.read(length)
+                        dfs = pickle.load(BytesIO(pkl))
+                        sub = dfs[dfs[by] == key]
+                        yield sub
+
+                if na_position == 'last':
+                    for p in nans:
+                        f.seek(indices[p])
+                        length = indices[p + 1] - indices[p]
+                        pkl = f.read(length)
+                        dfs = pickle.load(BytesIO(pkl))
+                        sub = dfs[numpy.isnan(dfs[by])]
+                        yield sub
+
+        res = StreamingDataFrame(
+            lambda: iterate(), **self.get_kwargs())
+        res._delete_.append(lambda: os.remove(temp_file))
+        return res
+
+    def __del__(self):
+        """
+        Calls every function in `_delete_`.
+        """
+        for f in self._delete_:
+            f()
+
 
 class StreamingSeries(StreamingDataFrame):
     """