1717import numpy as np
1818
1919from pandas ._libs import lib
20+ from pandas ._libs .missing import NA
2021from pandas ._libs .tslibs import (
2122 Timedelta ,
2223 Timestamp ,
@@ -353,7 +354,7 @@ def _from_sequence_of_strings(
353354 # duration to string casting behavior
354355 mask = isna (scalars )
355356 if not isinstance (strings , (pa .Array , pa .ChunkedArray )):
356- strings = pa .array (strings , type = pa .string (), from_pandas = True )
357+ strings = pa .array (strings , type = pa .string ())
357358 strings = pc .if_else (mask , None , strings )
358359 try :
359360 scalars = strings .cast (pa .int64 ())
@@ -374,7 +375,7 @@ def _from_sequence_of_strings(
374375 if isinstance (strings , (pa .Array , pa .ChunkedArray )):
375376 scalars = strings
376377 else :
377- scalars = pa .array (strings , type = pa .string (), from_pandas = True )
378+ scalars = pa .array (strings , type = pa .string ())
378379 scalars = pc .if_else (pc .equal (scalars , "1.0" ), "1" , scalars )
379380 scalars = pc .if_else (pc .equal (scalars , "0.0" ), "0" , scalars )
380381 scalars = scalars .cast (pa .bool_ ())
@@ -386,6 +387,13 @@ def _from_sequence_of_strings(
386387 from pandas .core .tools .numeric import to_numeric
387388
388389 scalars = to_numeric (strings , errors = "raise" )
390+ if not pa .types .is_decimal (pa_type ):
391+ # TODO: figure out why doing this cast breaks with decimal dtype
392+ # in test_from_sequence_of_strings_pa_array
393+ mask = strings .is_null ()
394+ scalars = pa .array (scalars , mask = np .array (mask ), type = pa_type )
395+ # TODO: could we just do strings.cast(pa_type)?
396+
389397 else :
390398 raise NotImplementedError (
391399 f"Converting strings to { pa_type } is not implemented."
@@ -428,7 +436,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
428436 """
429437 if isinstance (value , pa .Scalar ):
430438 pa_scalar = value
431- elif isna (value ):
439+ elif isna (value ) and not lib . is_float ( value ) :
432440 pa_scalar = pa .scalar (None , type = pa_type )
433441 else :
434442 # Workaround https://github.com/apache/arrow/issues/37291
@@ -445,7 +453,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
445453 value = value .as_unit (pa_type .unit )
446454 value = value ._value
447455
448- pa_scalar = pa .scalar (value , type = pa_type , from_pandas = True )
456+ pa_scalar = pa .scalar (value , type = pa_type )
449457
450458 if pa_type is not None and pa_scalar .type != pa_type :
451459 pa_scalar = pa_scalar .cast (pa_type )
@@ -477,6 +485,13 @@ def _box_pa_array(
477485 if copy :
478486 value = value .copy ()
479487 pa_array = value .__arrow_array__ ()
488+
489+ elif hasattr (value , "__arrow_array__" ):
490+ # e.g. StringArray
491+ if copy :
492+ value = value .copy ()
493+ pa_array = value .__arrow_array__ ()
494+
480495 else :
481496 if (
482497 isinstance (value , np .ndarray )
@@ -530,19 +545,32 @@ def _box_pa_array(
530545 pa_array = pa .array (dta ._ndarray , type = pa_type , mask = dta_mask )
531546 return pa_array
532547
548+ mask = None
549+ if getattr (value , "dtype" , None ) is None or value .dtype .kind not in "mfM" :
550+ # similar to isna(value) but exclude NaN
551+ # TODO: cythonize!
552+ mask = np .array ([x is NA or x is None for x in value ], dtype = bool )
553+
554+ from_pandas = False
555+ if pa .types .is_integer (pa_type ):
556+ # If user specifically asks to cast a numpy float array with NaNs
557+ # to pyarrow integer, we'll treat those NaNs as NA
558+ from_pandas = True
533559 try :
534- pa_array = pa .array (value , type = pa_type , from_pandas = True )
560+ pa_array = pa .array (
561+ value , type = pa_type , mask = mask , from_pandas = from_pandas
562+ )
535563 except (pa .ArrowInvalid , pa .ArrowTypeError ):
536564 # GH50430: let pyarrow infer type, then cast
537- pa_array = pa .array (value , from_pandas = True )
565+ pa_array = pa .array (value , mask = mask , from_pandas = from_pandas )
538566
539567 if pa_type is None and pa .types .is_duration (pa_array .type ):
540568 # Workaround https://github.com/apache/arrow/issues/37291
541569 from pandas .core .tools .timedeltas import to_timedelta
542570
543571 value = to_timedelta (value )
544572 value = value .to_numpy ()
545- pa_array = pa .array (value , type = pa_type , from_pandas = True )
573+ pa_array = pa .array (value , type = pa_type )
546574
547575 if pa .types .is_duration (pa_array .type ) and pa_array .null_count > 0 :
548576 # GH52843: upstream bug for duration types when originally
@@ -1208,7 +1236,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
12081236 if not len (values ):
12091237 return np .zeros (len (self ), dtype = bool )
12101238
1211- result = pc .is_in (self ._pa_array , value_set = pa .array (values , from_pandas = True ))
1239+ result = pc .is_in (self ._pa_array , value_set = pa .array (values ))
12121240 # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
12131241 # to False
12141242 return np .array (result , dtype = np .bool_ )
@@ -2015,7 +2043,7 @@ def __setitem__(self, key, value) -> None:
20152043 raise ValueError ("Length of indexer and values mismatch" )
20162044 chunks = [
20172045 * self ._pa_array [:key ].chunks ,
2018- pa .array ([value ], type = self ._pa_array .type , from_pandas = True ),
2046+ pa .array ([value ], type = self ._pa_array .type ),
20192047 * self ._pa_array [key + 1 :].chunks ,
20202048 ]
20212049 data = pa .chunked_array (chunks ).combine_chunks ()
@@ -2069,7 +2097,7 @@ def _rank_calc(
20692097 pa_type = pa .float64 ()
20702098 else :
20712099 pa_type = pa .uint64 ()
2072- result = pa .array (ranked , type = pa_type , from_pandas = True )
2100+ result = pa .array (ranked , type = pa_type )
20732101 return result
20742102
20752103 data = self ._pa_array .combine_chunks ()
@@ -2321,7 +2349,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
23212349 right , right_type = _to_numpy_and_type (right )
23222350 pa_type = left_type or right_type
23232351 result = np .where (cond , left , right )
2324- return pa .array (result , type = pa_type , from_pandas = True )
2352+ return pa .array (result , type = pa_type )
23252353
23262354 @classmethod
23272355 def _replace_with_mask (
@@ -2364,7 +2392,7 @@ def _replace_with_mask(
23642392 replacements = replacements .as_py ()
23652393 result = np .array (values , dtype = object )
23662394 result [mask ] = replacements
2367- return pa .array (result , type = values .type , from_pandas = True )
2395+ return pa .array (result , type = values .type )
23682396
23692397 # ------------------------------------------------------------------
23702398 # GroupBy Methods
@@ -2443,7 +2471,7 @@ def _groupby_op(
24432471 return type (self )(pa_result )
24442472 else :
24452473 # DatetimeArray, TimedeltaArray
2446- pa_result = pa .array (result , from_pandas = True )
2474+ pa_result = pa .array (result )
24472475 return type (self )(pa_result )
24482476
24492477 def _apply_elementwise (self , func : Callable ) -> list [list [Any ]]:
0 commit comments