27 changes: 21 additions & 6 deletions pandas/_libs/tslib.pyx
@@ -26,6 +26,7 @@ import numpy as np
 cnp.import_array()

 from pandas._libs.tslibs.dtypes cimport (
+    abbrev_to_npy_unit,
     get_supported_reso,
     npy_unit_to_abbrev,
 )
@@ -312,7 +313,7 @@ cpdef array_to_datetime(
         _TSObject tsobj
         tzinfo tz, tz_out = None
         cnp.flatiter it = cnp.PyArray_IterNew(values)
-        NPY_DATETIMEUNIT item_reso
+        NPY_DATETIMEUNIT item_reso, int_reso
         bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
         DatetimeParseState state = DatetimeParseState(creso)
         str abbrev
@@ -325,11 +326,11 @@
     else:
         abbrev = npy_unit_to_abbrev(creso)

-    if unit_for_numerics is not None:
-        # either creso or unit_for_numerics should be passed, not both
-        assert creso == NPY_FR_ns
-    else:
+    if unit_for_numerics is None:
         unit_for_numerics = abbrev
+        int_reso = NPY_FR_ns
+    else:
+        int_reso = get_supported_reso(abbrev_to_npy_unit(unit_for_numerics))

     result = np.empty((<object>values).shape, dtype=f"M8[{abbrev}]")
     iresult = result.view("i8").ravel()
@@ -370,7 +371,20 @@ cpdef array_to_datetime(
                 iresult[i] = get_datetime64_nanos(val, creso)
                 state.found_other = True

-            elif is_integer_object(val) or is_float_object(val):
+            elif is_integer_object(val):
+                if val == NPY_NAT:
+                    iresult[i] = NPY_NAT
+                else:
+                    item_reso = int_reso
+                    state.update_creso(item_reso)
+                    if infer_reso:
+                        creso = state.creso
+
+                    iresult[i] = cast_from_unit(val, unit_for_numerics, out_reso=creso)
+
+                state.found_other = True
+
+            elif is_float_object(val):
                 # these must be ns unit by-definition

                 if val != val or val == NPY_NAT:
@@ -460,6 +474,7 @@ cpdef array_to_datetime(
                 dayfirst=dayfirst,
                 utc=utc,
                 creso=state.creso,
+                unit_for_numerics=unit_for_numerics,
             )
         elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
             # i.e. we never encountered anything non-NaT, default to "s". This
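The user-visible effect of this rework: integer inputs now keep the resolution implied by `unit` (clamped to the nearest supported resolution) instead of always being converted to nanoseconds. A minimal sketch of the new behavior, with outputs taken from the docstring and test updates in this PR:

```python
import pandas as pd

# "D" is coarser than the supported resolutions (s/ms/us/ns), so it is
# clamped to seconds rather than forced all the way to nanoseconds.
pd.to_datetime([1, 2, 3], unit="D")
# DatetimeIndex(['1970-01-02', '1970-01-03', '1970-01-04'],
#               dtype='datetime64[s]', freq=None)

# Already-supported units are kept as-is.
pd.to_datetime([1_000_000_000], unit="ns")
# DatetimeIndex(['1970-01-01 00:00:01'], dtype='datetime64[ns]', freq=None)
```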
8 changes: 4 additions & 4 deletions pandas/core/tools/datetimes.py
@@ -26,11 +26,11 @@
     Timedelta,
     Timestamp,
     astype_overflowsafe,
+    get_supported_dtype,
     is_supported_dtype,
     timezones as libtimezones,
 )
 from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
-from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
 from pandas._libs.tslibs.parsing import (
     DateParseError,
     guess_datetime_format,
@@ -503,8 +503,9 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
         # Note we can't do "f" here because that could induce unwanted
         # rounding GH#14156, GH#20445
         arr = arg.astype(f"datetime64[{unit}]", copy=False)
+        dtype = get_supported_dtype(arr.dtype)
         try:
-            arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
+            arr = astype_overflowsafe(arr, dtype, copy=False)
         except OutOfBoundsDatetime:
             if errors == "raise":
                 raise
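Here `get_supported_dtype` resolves any datetime64 dtype to the nearest resolution pandas can hold (seconds through nanoseconds), which is what lets the overflow-safe cast above target, e.g., "M8[s]" for coarse units instead of hard-coding "M8[ns]". A quick illustration (my reading of the helper, not part of the diff):

```python
import numpy as np
from pandas._libs.tslibs import get_supported_dtype

get_supported_dtype(np.dtype("M8[D]"))   # dtype('<M8[s]')  # too coarse, clamped to "s"
get_supported_dtype(np.dtype("M8[us]"))  # dtype('<M8[us]') # already supported, unchanged
```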
@@ -534,7 +535,6 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
            utc=utc,
            errors=errors,
            unit_for_numerics=unit,
-           creso=cast(int, NpyDatetimeUnit.NPY_FR_ns.value),
        )

    result = DatetimeIndex(arr, name=name)
@@ -904,7 +904,7 @@ def to_datetime(

     >>> pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01"))
     DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
-                  dtype='datetime64[ns]', freq=None)
+                  dtype='datetime64[s]', freq=None)

     **Differences with strptime behavior**
6 changes: 5 additions & 1 deletion pandas/io/json/_json.py
@@ -1312,7 +1312,11 @@ def _try_convert_to_date(self, data: Series) -> Series:
         date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
         for date_unit in date_units:
             try:
-                return to_datetime(new_data, errors="raise", unit=date_unit)
+                # Without this as_unit cast, we would fail to overflow
+                # and get much-too-large dates
+                return to_datetime(new_data, errors="raise", unit=date_unit).dt.as_unit(
+                    "ns"
+                )

Member: something like this:

Suggested change:
-                # Without this as_unit cast, we would fail to overflow
+                # In case of multiple possible units, infer the likely unit based on the first unit
+                # for which the parsed dates fit within the nanoseconds bounds
+                # -> do as_unit cast to ensure OutOfBounds error
Comment on lines +1315 to +1318

Member: I am not directly understanding that comment. new_data are integers here? Why does the return unit of this function need to be nanoseconds? (To preserve current functionality?) Why would this give (wrong?) too-large dates?

Member Author: This is inside a block that tries large units and, if they overflow, then tries smaller units. This PR makes the large units not overflow in cases where this piece of code expects them to. Without this edit, e.g. pandas/tests/io/json/test_pandas.py::TestPandasContainer::test_date_unit fails with

left = DatetimeIndex(['30004724859-08-03', '30007462766-08-06', '30010200673-08-08',
               '30012938580-08-10', '300...        '30106027418-11-06', '30108765325-11-08', '30111503232-11-10'],
              dtype='datetime64[s]', freq=None)
right = DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',
               '2000-01-07', '2000-01-10', '200...2000-02-08', '2000-02-09',
               '2000-02-10', '2000-02-11'],
              dtype='datetime64[ns]', freq=None)

Member Author: Is my previous comment clear? And if so, any suggestions for how to adapt it into a clearer code comment?

Member:

> This is inside a block that tries large units and if they overflow then tries smaller units.

OK, that was the context I was missing. But then I still don't entirely get how this was currently working.

The dates you show above, like '2000-01-03', fit in the range of all units. So how would the integer value for that ever overflow?

If I put a breakpoint specifically for Overflow on the line below and run the full test_pandas.py file, I don't get a catch.

Member: Fetched the branch to play a bit with the tests: I was misled by the OverflowError, because it is OutOfBoundsDatetime that is being raised when trying to cast to nanoseconds.

So essentially this "infer unit" code assumes that the integer value came from a timestamp that originally had nanosecond resolution (or at least one that should fit in nanosecond resolution)? Which makes sense from the time we only supported ns. (Nowadays, though... but that is for another issue.)

We could also check this by doing a manual bounds check instead of the casting (I don't know if we have an existing helper function for that). Then we could keep the logic of inferring the date_unit, but keep the actual returned data in that unit, instead of forcing it to nanoseconds.

Member: (Also, for the case where the user specifies the unit, so we don't have to infer it, we actually don't need to force-cast to nanoseconds / check bounds, because that restriction is not needed then.)

             except (ValueError, OverflowError, TypeError):
                 continue
         return data
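For illustration, a minimal sketch of the manual bounds check floated in the thread above (a hypothetical helper, not something this PR adds), which would keep the inferred unit while still rejecting values that only parse cleanly because the tried unit is too large:

```python
import pandas as pd

def _within_ns_bounds(parsed) -> bool:
    # Timestamp.min/max are the nanosecond-resolution limits; if every
    # parsed value lies inside them, the inferred unit is plausible and
    # the data could be returned in that unit rather than cast to "ns".
    return bool(parsed.min() >= pd.Timestamp.min and parsed.max() <= pd.Timestamp.max)
```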
4 changes: 2 additions & 2 deletions pandas/tests/io/json/test_pandas.py
@@ -955,7 +955,7 @@
         ],
     )
     def test_date_format_series(self, date, date_unit, datetime_series):
-        ts = Series(Timestamp(date).as_unit("ns"), index=datetime_series.index)
+        ts = Series(Timestamp(date), index=datetime_series.index)
         ts.iloc[1] = pd.NaT
         ts.iloc[5] = pd.NaT
         if date_unit:
@@ -964,7 +964,7 @@ def test_date_format_series(self, date, date_unit, datetime_series):
         json = ts.to_json(date_format="iso")

         result = read_json(StringIO(json), typ="series")
-        expected = ts.copy()
+        expected = ts.copy().dt.as_unit("ns")
Member: Not for this PR, but this is another case where we currently return ns unit but could change to use us by default?

Member Author: Sure, but I'm inclined to leave that to Will to decide.

Member: In general I think we should use the same default of microseconds whenever we infer / parse strings, and for IO formats that means whenever they don't store any type / unit information (in contrast to e.g. parquet). We already do that for csv, html, excel, etc., so I don't think there is a reason not to do that for JSON. But I opened #63442 to track that.

         tm.assert_series_equal(result, expected)

     def test_date_format_series_raises(self, datetime_series):
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_resampler_grouper.py
@@ -502,7 +502,7 @@ def test_groupby_resample_empty_sum_string(
     result = gbrs.sum(min_count=min_count)

     index = pd.MultiIndex(
-        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
+        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns").as_unit("ns")]],
Member: Did this PR change that? (That this no longer returns nanoseconds?)

Member Author: Yes, but I didn't realize it until I just checked. I thought this PR only affected integer cases. I also didn't think that on main the unit keyword would have any effect in this case. So there are at least two things I need to look into.

Member Author: OK, I think I've figured this out. By passing unit we go down a path that in main always gives "ns" (so the fact that unit="ns" here is irrelevant). Adding as_unit just restores the behavior on main.

         codes=[[0, 1, 2], [0, 0, 0]],
         names=["A", None],
     )
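A minimal sketch of the author's explanation (the "ns" on main is stated above; the "s" resolution on this branch is my assumption, from date-only string inference):

```python
import pandas as pd

ts = pd.to_datetime("2000-01-01", unit="ns")
# on main:        ts.unit == "ns"  (this code path forced nanoseconds)
# on this branch: ts.unit == "s"   (assumed: string parsing infers the
#                                   resolution, making unit="ns" irrelevant)
ts.as_unit("ns")  # the test's added .as_unit("ns") restores main's behavior
```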
39 changes: 23 additions & 16 deletions pandas/tests/tools/test_to_datetime.py
@@ -1782,7 +1782,8 @@ class TestToDatetimeUnit:
     def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
         # GH#50870 Note we have separate tests that pd.Timestamp gets these right
         ts = Timestamp(item, unit=unit)
-        expected = DatetimeIndex([ts], dtype="M8[ns]")
+        dtype = "M8[ns]" if isinstance(item, float) else "M8[s]"
+        expected = DatetimeIndex([ts], dtype=dtype)

         result = to_datetime([item], unit=unit, cache=cache)
         tm.assert_index_equal(result, expected)
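In other words, the expected resolution in this test now depends on the input's type: integers with "Y"/"M" units resolve to "M8[s]", while floats still resolve to "M8[ns]". A quick sketch with hypothetical inputs (dtypes per the updated test):

```python
import pandas as pd

pd.to_datetime([2], unit="Y")    # -> dtype='datetime64[s]'  (integer input)
pd.to_datetime([2.0], unit="Y")  # -> dtype='datetime64[ns]' (float input)
```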
@@ -1796,7 +1797,7 @@ def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
         # with a nan!
         result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache)
         assert result.isna()[1]
-        tm.assert_index_equal(result[:1], expected)
+        tm.assert_index_equal(result[:1], expected.astype("M8[ns]"))

     @pytest.mark.parametrize("unit", ["Y", "M"])
     def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
@@ -1820,12 +1821,12 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
         # In 3.0, the string "1.5" is parsed as it would be without unit,
         # which fails. With errors="coerce" this becomes NaT.
         res = to_datetime(["1.5"], unit=unit, errors="coerce")
-        expected = to_datetime([NaT]).as_unit("ns")
+        expected = to_datetime([NaT])
         tm.assert_index_equal(res, expected)

         # round floats are OK
         res = to_datetime([1.0], unit=unit)
-        expected = to_datetime([1], unit=unit)
+        expected = to_datetime([1], unit=unit).as_unit("ns")
         tm.assert_index_equal(res, expected)

     def test_unit(self, cache):
@@ -1853,7 +1854,7 @@ def test_unit_array_mixed_nans_large_int(self, cache):
         values = [1420043460000000000000000, iNaT, NaT, np.nan, "NaT"]

         result = to_datetime(values, errors="coerce", unit="s", cache=cache)
-        expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[ns]")
+        expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[s]")
         tm.assert_index_equal(result, expected)

         msg = "cannot convert input 1420043460000000000000000 with the unit 's'"
@@ -1950,12 +1951,13 @@ def test_to_datetime_unit(self, dtype):
         epoch = 1370745748
         ser = Series([epoch + t for t in range(20)]).astype(dtype)
         result = to_datetime(ser, unit="s")
+        unit = "s" if dtype is int else "ns"
         expected = Series(
             [
                 Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t)
                 for t in range(20)
             ],
-            dtype="M8[ns]",
+            dtype=f"M8[{unit}]",
         )
         tm.assert_series_equal(result, expected)

@@ -1964,10 +1966,13 @@ def test_to_datetime_unit_with_nulls(self, null):
         epoch = 1370745748
         ser = Series([epoch + t for t in range(20)] + [null])
         result = to_datetime(ser, unit="s")
+        # With np.nan, the list gets cast to a float64 array, which always
+        # gets ns unit.
+        unit = "ns" if null is np.nan else "s"
         expected = Series(
             [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)]
             + [NaT],
-            dtype="M8[ns]",
+            dtype=f"M8[{unit}]",
         )
         tm.assert_series_equal(result, expected)

@@ -1992,25 +1997,25 @@ def test_to_datetime_unit_na_values(self):
         result = to_datetime([1, 2, "NaT", NaT, np.nan], unit="D")
         expected = DatetimeIndex(
             [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3,
-            dtype="M8[ns]",
+            dtype="M8[s]",
         )
         tm.assert_index_equal(result, expected)

-    @pytest.mark.parametrize("bad_val", ["foo", 111111111])
+    @pytest.mark.parametrize("bad_val", ["foo", 111111111111111])
     def test_to_datetime_unit_invalid(self, bad_val):
         if bad_val == "foo":
             msg = f"Unknown datetime string format, unable to parse: {bad_val}"
         else:
-            msg = "cannot convert input 111111111 with the unit 'D'"
+            msg = "cannot convert input 111111111111111 with the unit 'D'"
         with pytest.raises(ValueError, match=msg):
             to_datetime([1, 2, bad_val], unit="D")

-    @pytest.mark.parametrize("bad_val", ["foo", 111111111])
+    @pytest.mark.parametrize("bad_val", ["foo", 111111111111111])
     def test_to_timestamp_unit_coerce(self, bad_val):
         # coerce we can process
         expected = DatetimeIndex(
             [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1,
-            dtype="M8[ns]",
+            dtype="M8[s]",
         )
         result = to_datetime([1, 2, bad_val], unit="D", errors="coerce")
         tm.assert_index_equal(result, expected)
@@ -3223,7 +3228,7 @@ def test_unix(self):
         result = Series(to_datetime([0, 1, 2], unit="D", origin="unix"))
         expected = Series(
             [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")],
-            dtype="M8[ns]",
+            dtype="M8[s]",
         )
         tm.assert_series_equal(result, expected)

@@ -3262,8 +3267,10 @@ def test_invalid_origin(self, unit):
     def test_epoch(self, units, epochs):
         epoch_1960 = Timestamp(1960, 1, 1)
         units_from_epochs = np.arange(5, dtype=np.int64)
+        exp_unit = "s" if units == "D" else units
         expected = Series(
-            [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs]
+            [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs],
+            dtype=f"M8[{exp_unit}]",
         )

         result = Series(to_datetime(units_from_epochs, unit=units, origin=epochs))
@@ -3358,7 +3365,7 @@ def test_arg_tz_ns_unit(self, offset, utc, exp):
         # GH 25546
         arg = "2019-01-01T00:00:00.000" + offset
         result = to_datetime([arg], unit="ns", utc=utc)
-        expected = to_datetime([exp]).as_unit("ns")
+        expected = to_datetime([exp]).as_unit("us")
         tm.assert_index_equal(result, expected)

@@ -3458,7 +3465,7 @@ def test_empty_string_datetime_coerce__unit():
     # GH13044
     # coerce empty string to pd.NaT
     result = to_datetime([1, ""], unit="s", errors="coerce")
-    expected = DatetimeIndex(["1970-01-01 00:00:01", "NaT"], dtype="datetime64[ns]")
+    expected = DatetimeIndex(["1970-01-01 00:00:01", "NaT"], dtype="datetime64[s]")
    tm.assert_index_equal(expected, result)

    # verify that no exception is raised even when errors='raise' is set