
Commit 0bae1ee

Implement transformers for UDF inputs and outputs
1 parent 9996e09 commit 0bae1ee

13 files changed: +1254 -65 lines


singlestoredb/functions/ext/asgi.py

Lines changed: 11 additions & 8 deletions
@@ -311,6 +311,7 @@ def cancel_on_event(

 def build_udf_endpoint(
     func: Callable[..., Any],
+    args_data_format: str,
     returns_data_format: str,
 ) -> Callable[..., Any]:
     """
@@ -352,11 +353,12 @@ async def do_func(

         return do_func

-    return build_vector_udf_endpoint(func, returns_data_format)
+    return build_vector_udf_endpoint(func, args_data_format, returns_data_format)


 def build_vector_udf_endpoint(
     func: Callable[..., Any],
+    args_data_format: str,
     returns_data_format: str,
 ) -> Callable[..., Any]:
     """
@@ -422,6 +424,7 @@ async def do_func(

 def build_tvf_endpoint(
     func: Callable[..., Any],
+    args_data_format: str,
     returns_data_format: str,
 ) -> Callable[..., Any]:
     """
@@ -451,27 +454,27 @@ async def do_func(
             rows: Sequence[Sequence[Any]],
         ) -> Tuple[Sequence[int], List[Tuple[Any, ...]]]:
             '''Call function on given rows of data.'''
-            out_ids: List[int] = []
-            out = []
+            out: List[Tuple[Any, ...]] = []
             # Call function on each row of data
             async with timer('call_function'):
+                out = []
                 for i, row in zip(row_ids, rows):
                     cancel_on_event(cancel_event)
                     if is_async:
                         res = await func(*row)
                     else:
                         res = func(*row)
                     out.extend(as_list_of_tuples(res))
-                    out_ids.extend([row_ids[i]] * (len(out)-len(out_ids)))
-            return out_ids, out
+            return [row_ids[0]] * len(out), out

         return do_func

-    return build_vector_tvf_endpoint(func, returns_data_format)
+    return build_vector_tvf_endpoint(func, args_data_format, returns_data_format)


 def build_vector_tvf_endpoint(
     func: Callable[..., Any],
+    args_data_format: str,
     returns_data_format: str,
 ) -> Callable[..., Any]:
     """
@@ -575,9 +578,9 @@ def make_func(
     )

     if function_type == 'tvf':
-        do_func = build_tvf_endpoint(func, returns_data_format)
+        do_func = build_tvf_endpoint(func, args_data_format, returns_data_format)
     else:
-        do_func = build_udf_endpoint(func, returns_data_format)
+        do_func = build_udf_endpoint(func, args_data_format, returns_data_format)

     do_func.__name__ = name
     do_func.__doc__ = func.__doc__

singlestoredb/functions/ext/rowdat_1.py

Lines changed: 26 additions & 9 deletions
@@ -462,7 +462,7 @@ def _dump_vectors(
             default = DEFAULT_VALUES[rtype]
             try:
                 if rtype in numeric_formats:
-                    if value is None:
+                    if is_null or value is None:
                         out.write(struct.pack(numeric_formats[rtype], default))
                     else:
                         if rtype in int_types:
@@ -486,14 +486,14 @@ def _dump_vectors(
                             ),
                         )
                 elif rtype in string_types:
-                    if value is None:
+                    if is_null or value is None:
                         out.write(struct.pack('<q', 0))
                     else:
                         sval = value.encode('utf-8')
                         out.write(struct.pack('<q', len(sval)))
                         out.write(sval)
                 elif rtype in binary_types:
-                    if value is None:
+                    if is_null or value is None:
                         out.write(struct.pack('<q', 0))
                     else:
                         out.write(struct.pack('<q', len(value)))
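For context, a minimal standalone sketch of the NULL encodings used in the branches above (not part of the commit); '<q' is a little-endian 64-bit value, and NULL numerics fall back to a per-type default, presumably paired with a separate null flag on the wire:

import io
import struct

out = io.BytesIO()

# NULL numeric (a BIGINT column, say): the type's default value is
# written in the type's own format.
out.write(struct.pack('<q', 0))

# NULL string/binary: a zero length prefix and no payload.
out.write(struct.pack('<q', 0))

# Non-NULL string: the byte length followed by the UTF-8 payload.
sval = 'hello'.encode('utf-8')
out.write(struct.pack('<q', len(sval)))
out.write(sval)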
@@ -571,8 +571,18 @@ def _load_numpy_accel(

     for i, (_, dtype, transformer) in enumerate(colspec):
         if transformer is not None:
-            t = np.vectorize(transformer)
-            numpy_cols[i] = (t(numpy_cols[i][0]), numpy_cols[i][1])
+            # Numpy will try to be "helpful" and create multidimensional arrays
+            # from nested iterables. We don't usually want that. What we want is
+            # numpy arrays of Python objects (e.g., lists, dicts, etc). To do that,
+            # we have to create an empty array of the correct length and dtype=object,
+            # then fill it in with the transformed values. The transformer may have
+            # an output_type attribute that we can use to create a more specific type.
+            if getattr(transformer, 'output_type', None):
+                new_col = np.empty(len(numpy_cols[i][0]), dtype=transformer.output_type)
+                new_col[:] = list(map(transformer, numpy_cols[i][0]))
+            else:
+                new_col = np.array(list(map(transformer, numpy_cols[i][0])))
+            numpy_cols[i] = (new_col, numpy_cols[i][1])

     return numpy_ids, numpy_cols
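The comment in this hunk is easy to demonstrate standalone (this snippet is illustrative only, not part of the commit):

import numpy as np

values = [[1, 2], [3, 4], [5, 6]]

# np.array "helpfully" turns nested lists into a 3x2 numeric matrix...
a = np.array(values)
print(a.shape)    # (3, 2)

# ...whereas preallocating an object array of the right length and
# assigning into it keeps each element a Python list.
b = np.empty(len(values), dtype=object)
b[:] = values
print(b.shape)    # (3,)
print(b[0])       # [1, 2]

This is also why the transformer's optional output_type attribute matters: it lets the column keep a more specific dtype than object when one is known.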

@@ -589,8 +599,7 @@ def _dump_numpy_accel(

     for i, (_, dtype, transformer) in enumerate(returns):
         if transformer is not None:
-            t = np.vectorize(transformer)
-            cols[i] = (t(cols[i][0]), cols[i][1])
+            cols[i] = (np.array(list(map(transformer, cols[i][0]))), cols[i][1])

     return _singlestoredb_accel.dump_rowdat_1_numpy(returns, row_ids, cols)
@@ -678,10 +687,18 @@ def _dump_polars_accel(
     if not has_accel:
         raise RuntimeError('could not load SingleStoreDB extension')

+    import numpy as np
+    import polars as pl
+
     numpy_ids = row_ids.to_numpy()
     numpy_cols = [
         (
-            data.to_numpy(),
+            # Polars will try to be "helpful" and convert nested iterables into
+            # multidimensional arrays. We don't usually want that. What we want is
+            # numpy arrays of Python objects (e.g., lists, dicts, etc). To
+            # do that, we have to convert the Series to a list first.
+            np.array(data.to_list())
+            if isinstance(data.dtype, (pl.Struct, pl.Object)) else data.to_numpy(),
             mask.to_numpy() if mask is not None else None,
         )
         for data, mask in cols
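A standalone illustration of the Struct/Object special case (not part of the commit; assumes a recent polars):

import numpy as np
import polars as pl

s = pl.Series('vals', [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])   # Struct dtype

# Round-tripping through to_list() keeps each element a plain Python
# dict, so the resulting numpy column has dtype=object.
col = np.array(s.to_list())
print(col.dtype)   # object
print(col[0])      # {'a': 1, 'b': 2}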
@@ -722,7 +739,7 @@ def _create_arrow_mask(
     if mask is None:
         return data.is_null().to_numpy(zero_copy_only=False)

-    return pc.or_(data.is_null(), mask.is_null()).to_numpy(zero_copy_only=False)
+    return pc.or_(data.is_null(), mask).to_numpy(zero_copy_only=False)


 def _dump_arrow_accel(
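The fix in _create_arrow_mask changes the semantics: the incoming mask is already a boolean array marking NULL rows, so it should be OR'ed in directly rather than queried for nulls of its own. A standalone sketch (not part of the commit):

import pyarrow as pa
import pyarrow.compute as pc

data = pa.array([1, None, 3])
mask = pa.array([False, False, True])   # True == "treat this row as NULL"

# Old: pc.or_(data.is_null(), mask.is_null()) asked whether the *mask*
# contained nulls (always False here), losing the masked third row.
# New: the boolean mask participates directly.
combined = pc.or_(data.is_null(), mask)
print(combined.to_numpy(zero_copy_only=False))   # [False  True  True]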

singlestoredb/functions/ext/utils.py

Lines changed: 1 addition & 3 deletions
@@ -7,7 +7,6 @@
 import zipfile
 from copy import copy
 from typing import Any
-from typing import Callable
 from typing import Dict
 from typing import List
 from typing import Optional
@@ -32,8 +31,7 @@ def formatMessage(self, record: logging.LogRecord) -> str:
         recordcopy.__dict__['levelprefix'] = levelname + ':' + seperator
         return super().formatMessage(recordcopy)

-
-Transformer = Callable[..., Any]
+from ..typing import Transformer


 def apply_transformer(func: Optional[Transformer], v: Any) -> Any:
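apply_transformer's body is not part of this diff; based on its signature, a minimal sketch of the expected behavior (an assumption, not the library's actual implementation) would be:

from typing import Any, Callable, Optional

Transformer = Callable[..., Any]

def apply_transformer(func: Optional[Transformer], v: Any) -> Any:
    # No transformer configured: pass the value through untouched.
    if func is None:
        return v
    # Otherwise apply the user-supplied callable to the single value.
    return func(v)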

singlestoredb/functions/signature.py

Lines changed: 43 additions & 7 deletions
@@ -923,6 +923,7 @@ def get_schema(
     spec: Any,
     overrides: Optional[List[ParamSpec]] = None,
     mode: str = 'parameter',
+    masks: Optional[List[bool]] = None,
 ) -> Tuple[List[ParamSpec], str, str]:
     """
     Expand a return type annotation into a list of types and field names.
@@ -935,6 +936,8 @@ def get_schema(
         List of SQL type specifications for the return type
     mode : str
         The mode of the function, either 'parameter' or 'return'
+    masks : Optional[List[bool]]
+        Whether each type is wrapped in a Masked type

     Returns
     -------
@@ -996,7 +999,13 @@ def get_schema(
                 'dataclass, TypedDict, or pydantic model',
             )
         spec = typing.get_args(unpacked_spec[0])[0]
-        data_format = 'list'
+        # Lists as output from TVFs are considered scalar outputs
+        # since they correspond to individual Python objects, not
+        # a true vector type.
+        if function_type == 'tvf':
+            data_format = 'scalar'
+        else:
+            data_format = 'list'

     elif all([utils.is_vector(x, include_masks=True) for x in unpacked_spec]):
         pass
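To make the new comment concrete: a TVF whose return annotation is a List yields one Python list per call, and each element of that list becomes an output row, so the list behaves like a scalar Python object rather than a vector column. A hypothetical TVF for illustration only (not from the codebase):

from typing import List

def split_words(sentence: str) -> List[str]:
    # One input row fans out into one output row per word; the returned
    # list is a per-call Python object, not a columnar (vector) value.
    return sentence.split()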
@@ -1113,7 +1122,11 @@ def get_schema(
         _, inner_apply_meta = unpack_annotated(typing.get_args(spec)[0])
         if inner_apply_meta.sql_type:
             udf_attrs = inner_apply_meta
-            colspec = get_schema(typing.get_args(spec)[0], mode=mode)[0]
+            colspec = get_schema(
+                typing.get_args(spec)[0],
+                mode=mode,
+                masks=[masks[0]] if masks else None,
+            )[0]
         else:
             colspec = [
                 ParamSpec(
@@ -1144,6 +1157,7 @@ def get_schema(
                 overrides=[overrides[i]] if overrides else [],
                 # Always pass UDF mode for individual items
                 mode=mode,
+                masks=[masks[i]] if masks else None,
             )

             # Use the name from the overrides if specified
@@ -1185,7 +1199,7 @@ def get_schema(
     out = []

     # Normalize colspec data types
-    for c in colspec:
+    for i, c in enumerate(colspec):

         # if the dtype is a string, it is resolved already
         if isinstance(c.dtype, str):
@@ -1203,13 +1217,27 @@ def get_schema(
                 include_null=c.is_optional,
             )

+        sql_type = c.sql_type if isinstance(c.sql_type, str) else udf_attrs.sql_type
+
+        is_optional = (
+            c.is_optional
+            or bool(dtype and dtype.endswith('?'))
+            or bool(masks and masks[i])
+        )
+
+        if is_optional:
+            if dtype and not dtype.endswith('?'):
+                dtype += '?'
+            if sql_type and re.search(r' NOT NULL\b', sql_type):
+                sql_type = re.sub(r' NOT NULL\b', r' NULL', sql_type)
+
         p = ParamSpec(
             name=c.name,
             dtype=dtype,
-            sql_type=c.sql_type if isinstance(c.sql_type, str) else udf_attrs.sql_type,
-            is_optional=c.is_optional or bool(dtype and dtype.endswith('?')),
-            transformer=udf_attrs.input_transformer
-            if mode == 'parameter' else udf_attrs.output_transformer,
+            sql_type=sql_type,
+            is_optional=is_optional,
+            transformer=udf_attrs.args_transformer
+            if mode == 'parameter' else udf_attrs.returns_transformer,
         )

         out.append(p)
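The nullability normalization added here boils down to a suffix and a regex substitution; a standalone illustration (not part of the commit):

import re

dtype = 'str'
sql_type = 'TEXT NOT NULL'

# A masked (or otherwise optional) column is forced to be nullable:
# the dtype gains a '?' suffix and ' NOT NULL' becomes ' NULL'.
if not dtype.endswith('?'):
    dtype += '?'
sql_type = re.sub(r' NOT NULL\b', r' NULL', sql_type)
print(dtype, '|', sql_type)   # str? | TEXT NULL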
@@ -1347,6 +1375,7 @@ def get_signature(
                 unpack_masked_type(param.annotation),
                 overrides=[args_colspec[i]] if args_colspec else [],
                 mode='parameter',
+                masks=[args_masks[i]] if args_masks else [],
             )
             args_data_formats.append(args_data_format)
@@ -1406,6 +1435,7 @@ def get_signature(
         unpack_masked_type(signature.return_annotation),
         overrides=returns_colspec if returns_colspec else None,
         mode='return',
+        masks=ret_masks or [],
     )

     rdf = out['returns_data_format'] = out['returns_data_format'] or 'scalar'
@@ -1421,6 +1451,12 @@ def get_signature(
             'scalar or vector types.',
         )

+    # If we have function parameters and the function is a TVF, then
+    # the return type should just match the parameter vector types. This ensures
+    # the output producers for scalars and vectors are consistent.
+    elif function_type == 'tvf' and rdf == 'scalar' and args_schema:
+        out['returns_data_format'] = out['args_data_format']
+
     # All functions have to return a value, so if none was specified try to
     # insert a reasonable default that includes NULLs.
     if not ret_schema:
