
Commit 638bd9c

Merge branch 'master' into enh-masked-2d

2 parents: 7a6c226 + bcf2406

File tree: 8 files changed (+141 / -146 lines)


doc/source/whatsnew/v1.3.0.rst

Lines changed: 2 additions & 0 deletions
@@ -390,6 +390,7 @@ I/O
 - Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`)
 - :meth:`read_sql` returned an empty generator if ``chunksize`` was non-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`)
 - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`)
+- Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`)

 Period
 ^^^^^^
@@ -430,6 +431,7 @@ Reshaping
 - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
 - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`)
 - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)
+- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)

 Sparse
 ^^^^^^
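
As context for the :issue:`39574` entry, a minimal sketch of the behavior being fixed, assuming pandas 1.3 where ``DataFrame.append`` is still public (later versions would use ``pd.concat``):

import pandas as pd

# Appending a timedelta64 column onto a datetime64 column: before GH#39574
# the all-NA/placeholder handling could produce an incorrect result dtype;
# after the fix the mixed column falls back to object dtype.
df1 = pd.DataFrame({"a": pd.to_datetime(["2021-01-01"])})
df2 = pd.DataFrame({"a": pd.to_timedelta(["1 day"])})

result = df1.append(df2, ignore_index=True)
print(result["a"].dtype)  # object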

pandas/core/dtypes/concat.py

Lines changed: 6 additions & 2 deletions
@@ -61,7 +61,7 @@ def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
     return arr.astype(dtype, copy=False)


-def concat_compat(to_concat, axis: int = 0):
+def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
     """
     provide concatenation of an array of arrays each of which is a single
     'normalized' dtypes (in that for example, if it's object, then it is a
@@ -72,6 +72,9 @@ def concat_compat(to_concat, axis: int = 0):
     ----------
     to_concat : array of arrays
     axis : axis to provide concatenation
+    ea_compat_axis : bool, default False
+        For ExtensionArray compat, behave as if axis == 1 when determining
+        whether to drop empty arrays.

     Returns
     -------
@@ -91,7 +94,8 @@ def is_nonempty(x) -> bool:
     # marginal given that it would still require shape & dtype calculation and
     # np.concatenate which has them both implemented is compiled.
     non_empties = [x for x in to_concat if is_nonempty(x)]
-    if non_empties and axis == 0:
+    if non_empties and axis == 0 and not ea_compat_axis:
+        # ea_compat_axis see GH#39574
         to_concat = non_empties

     kinds = {obj.dtype.kind for obj in to_concat}
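
A sketch of what the new flag changes, calling the internal helper directly (internal API; outputs assumed from the diff and GH#39574):

import numpy as np
from pandas.core.dtypes.concat import concat_compat

empty = np.array([], dtype="m8[ns]")     # empty timedelta64 array
vals = np.array([0, 1], dtype="M8[ns]")  # datetime64 values

# Default: the empty array is dropped before dtype resolution, so the
# result keeps the datetime64 dtype of the non-empty input.
print(concat_compat([empty, vals]).dtype)  # datetime64[ns]

# With ea_compat_axis=True the empty array still participates, so the
# mixed m8/M8 inputs coerce to object, matching axis=1 behavior.
print(concat_compat([empty, vals], ea_compat_axis=True).dtype)  # object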

pandas/core/internals/concat.py

Lines changed: 44 additions & 118 deletions
@@ -1,9 +1,8 @@
 from __future__ import annotations

-from collections import defaultdict
 import copy
 import itertools
-from typing import TYPE_CHECKING, Dict, List, Sequence, cast
+from typing import TYPE_CHECKING, Dict, List, Sequence

 import numpy as np

@@ -14,16 +13,13 @@
 from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type
 from pandas.core.dtypes.common import (
     is_categorical_dtype,
-    is_datetime64_dtype,
     is_datetime64tz_dtype,
+    is_dtype_equal,
     is_extension_array_dtype,
-    is_float_dtype,
-    is_numeric_dtype,
     is_sparse,
-    is_timedelta64_dtype,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.missing import isna_all
+from pandas.core.dtypes.missing import is_valid_na_for_dtype, isna_all

 import pandas.core.algorithms as algos
 from pandas.core.arrays import DatetimeArray, ExtensionArray
@@ -33,7 +29,6 @@

 if TYPE_CHECKING:
     from pandas import Index
-    from pandas.core.arrays.sparse.dtype import SparseDtype


 def concatenate_block_managers(
@@ -232,6 +227,29 @@ def dtype(self):
             return blk.dtype
         return ensure_dtype_can_hold_na(blk.dtype)

+    def is_valid_na_for(self, dtype: DtypeObj) -> bool:
+        """
+        Check that we are all-NA of a type/dtype that is compatible with this dtype.
+        Augments `self.is_na` with an additional check of the type of NA values.
+        """
+        if not self.is_na:
+            return False
+        if self.block is None:
+            return True
+
+        if self.dtype == object:
+            values = self.block.values
+            return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
+
+        if self.dtype.kind == dtype.kind == "M" and not is_dtype_equal(
+            self.dtype, dtype
+        ):
+            # fill_values match but we should not cast self.block.values to dtype
+            return False
+
+        na_value = self.block.fill_value
+        return is_valid_na_for_dtype(na_value, dtype)
+
     @cache_readonly
     def is_na(self) -> bool:
         if self.block is None:
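
The new check leans on ``is_valid_na_for_dtype``; a small sketch of how that helper discriminates between NA markers (behavior assumed per pandas 1.3 internals):

import numpy as np
from pandas.core.dtypes.missing import is_valid_na_for_dtype

# An all-NaT timedelta64 block is "all NA", but timedelta NaT is not a
# valid missing marker for a datetime64 column, so it must not be cast.
print(is_valid_na_for_dtype(np.timedelta64("NaT"), np.dtype("M8[ns]")))  # False
print(is_valid_na_for_dtype(np.datetime64("NaT"), np.dtype("M8[ns]")))   # True
print(is_valid_na_for_dtype(np.nan, np.dtype("float64")))                # True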
@@ -262,7 +280,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
         else:
             fill_value = upcasted_na

-            if self.is_na:
+            if self.is_valid_na_for(empty_dtype):
                 blk_dtype = getattr(self.block, "dtype", None)

                 if blk_dtype == np.dtype(object):
@@ -276,10 +294,9 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                 if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype(
                     empty_dtype
                 ):
-                    if self.block is None:
-                        # TODO(EA2D): special case unneeded with 2D EAs
-                        i8values = np.full(self.shape[1], fill_value.value)
-                        return DatetimeArray(i8values, dtype=empty_dtype)
+                    # TODO(EA2D): special case unneeded with 2D EAs
+                    i8values = np.full(self.shape[1], fill_value.value)
+                    return DatetimeArray(i8values, dtype=empty_dtype)
                 elif is_categorical_dtype(blk_dtype):
                     pass
                 elif is_extension_array_dtype(blk_dtype):
@@ -295,6 +312,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                     empty_arr, allow_fill=True, fill_value=fill_value
                 )
             else:
+                # NB: we should never get here with empty_dtype integer or bool;
+                # if we did, the missing_arr.fill would cast to gibberish
                 missing_arr = np.empty(self.shape, dtype=empty_dtype)
                 missing_arr.fill(fill_value)
                 return missing_arr
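
For the tz-aware branch above, a sketch of the placeholder construction it performs (assumes pandas 1.3 internals, where ``DatetimeArray`` accepts int64 nanosecond values and ``pd.NaT.value`` is the iNaT sentinel):

import numpy as np
import pandas as pd
from pandas.arrays import DatetimeArray

# Build an all-NaT tz-aware placeholder the way the reindex path does:
# fill an i8 array with NaT's integer representation, then wrap it.
i8values = np.full(3, pd.NaT.value)
arr = DatetimeArray(i8values, dtype=pd.DatetimeTZDtype(tz="UTC"))
print(arr.isna().all())  # True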
@@ -362,14 +381,12 @@ def _concatenate_join_units(
         # concatting with at least one EA means we are concatting a single column
         # the non-EA values are 2D arrays with shape (1, n)
         to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
-        concat_values = concat_compat(to_concat, axis=0)
-        if not isinstance(concat_values, ExtensionArray) or (
-            isinstance(concat_values, DatetimeArray) and concat_values.tz is None
-        ):
+        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
+        if not is_extension_array_dtype(concat_values.dtype):
             # if the result of concat is not an EA but an ndarray, reshape to
             # 2D to put it a non-EA Block
-            # special case DatetimeArray, which *is* an EA, but is put in a
-            # consolidated 2D block
+            # special case DatetimeArray/TimedeltaArray, which *is* an EA, but
+            # is put in a consolidated 2D block
             concat_values = np.atleast_2d(concat_values)
     else:
         concat_values = concat_compat(to_concat, axis=concat_axis)
@@ -419,108 +436,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
         return empty_dtype

     has_none_blocks = any(unit.block is None for unit in join_units)
-    dtypes = [None if unit.block is None else unit.dtype for unit in join_units]

-    filtered_dtypes = [
+    dtypes = [
         unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
     ]
-    if not len(filtered_dtypes):
-        filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
-    dtype_alt = find_common_type(filtered_dtypes)
-
-    upcast_classes = _get_upcast_classes(join_units, dtypes)
-
-    if is_extension_array_dtype(dtype_alt):
-        return dtype_alt
-    elif dtype_alt == object:
-        return dtype_alt
-
-    # TODO: de-duplicate with maybe_promote?
-    # create the result
-    if "extension" in upcast_classes:
-        return np.dtype("object")
-    elif "bool" in upcast_classes:
-        if has_none_blocks:
-            return np.dtype(np.object_)
-        else:
-            return np.dtype(np.bool_)
-    elif "datetimetz" in upcast_classes:
-        # GH-25014. We use NaT instead of iNaT, since this eventually
-        # ends up in DatetimeArray.take, which does not allow iNaT.
-        dtype = upcast_classes["datetimetz"]
-        return dtype[0]
-    elif "datetime" in upcast_classes:
-        return np.dtype("M8[ns]")
-    elif "timedelta" in upcast_classes:
-        return np.dtype("m8[ns]")
-    else:
-        try:
-            common_dtype = np.find_common_type(upcast_classes, [])
-        except TypeError:
-            # At least one is an ExtensionArray
-            return np.dtype(np.object_)
-        else:
-            if is_float_dtype(common_dtype):
-                return common_dtype
-            elif is_numeric_dtype(common_dtype):
-                if has_none_blocks:
-                    return np.dtype(np.float64)
-                else:
-                    return common_dtype
-
-    msg = "invalid dtype determination in get_concat_dtype"
-    raise AssertionError(msg)
-
-
-def _get_upcast_classes(
-    join_units: Sequence[JoinUnit],
-    dtypes: Sequence[DtypeObj],
-) -> Dict[str, List[DtypeObj]]:
-    """Create mapping between upcast class names and lists of dtypes."""
-    upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
-    null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list)
-    for dtype, unit in zip(dtypes, join_units):
-        if dtype is None:
-            continue
-
-        upcast_cls = _select_upcast_cls_from_dtype(dtype)
-        # Null blocks should not influence upcast class selection, unless there
-        # are only null blocks, when same upcasting rules must be applied to
-        # null upcast classes.
-        if unit.is_na:
-            null_upcast_classes[upcast_cls].append(dtype)
-        else:
-            upcast_classes[upcast_cls].append(dtype)
-
-    if not upcast_classes:
-        upcast_classes = null_upcast_classes
-
-    return upcast_classes
-
-
-def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
-    """Select upcast class name based on dtype."""
-    if is_categorical_dtype(dtype):
-        return "extension"
-    elif is_datetime64tz_dtype(dtype):
-        return "datetimetz"
-    elif is_extension_array_dtype(dtype):
-        return "extension"
-    elif issubclass(dtype.type, np.bool_):
-        return "bool"
-    elif issubclass(dtype.type, np.object_):
-        return "object"
-    elif is_datetime64_dtype(dtype):
-        return "datetime"
-    elif is_timedelta64_dtype(dtype):
-        return "timedelta"
-    elif is_sparse(dtype):
-        dtype = cast("SparseDtype", dtype)
-        return dtype.subtype.name
-    elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
-        return dtype.name
-    else:
-        return "float"
+    if not len(dtypes):
+        dtypes = [unit.dtype for unit in join_units if unit.block is not None]
+
+    dtype = find_common_type(dtypes)
+    if has_none_blocks:
+        dtype = ensure_dtype_can_hold_na(dtype)
+    return dtype


 def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
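
The net effect of the rewrite is that dtype resolution is delegated to two shared helpers. A sketch of their behavior (assumed per pandas 1.3):

import numpy as np
from pandas.core.dtypes.cast import ensure_dtype_can_hold_na, find_common_type

# Non-NA join-unit dtypes are reduced with find_common_type ...
print(find_common_type([np.dtype("int64"), np.dtype("float32")]))  # float64

# ... and when any join unit has no block (an all-missing column), the
# result is widened so it can hold NA: ints become float64, bool object.
print(ensure_dtype_can_hold_na(np.dtype("int64")))  # float64
print(ensure_dtype_can_hold_na(np.dtype("bool")))   # object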

pandas/io/sas/sas7bdat.py

Lines changed: 18 additions & 14 deletions
@@ -23,13 +23,28 @@
 from pandas.errors import EmptyDataError, OutOfBoundsDatetime

 import pandas as pd
+from pandas import isna

 from pandas.io.common import get_handle
 from pandas.io.sas._sas import Parser
 import pandas.io.sas.sas_constants as const
 from pandas.io.sas.sasreader import ReaderBase


+def _parse_datetime(sas_datetime: float, unit: str):
+    if isna(sas_datetime):
+        return pd.NaT
+
+    if unit == "s":
+        return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
+
+    elif unit == "d":
+        return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
+
+    else:
+        raise ValueError("unit must be 'd' or 's'")
+
+
 def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     """
     Convert to Timestamp if possible, otherwise to datetime.datetime.
@@ -51,20 +66,9 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
     try:
         return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
     except OutOfBoundsDatetime:
-        if unit == "s":
-            s_series = sas_datetimes.apply(
-                lambda sas_float: datetime(1960, 1, 1) + timedelta(seconds=sas_float)
-            )
-            s_series = cast(pd.Series, s_series)
-            return s_series
-        elif unit == "d":
-            d_series = sas_datetimes.apply(
-                lambda sas_float: datetime(1960, 1, 1) + timedelta(days=sas_float)
-            )
-            d_series = cast(pd.Series, d_series)
-            return d_series
-        else:
-            raise ValueError("unit must be 'd' or 's'")
+        s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
+        s_series = cast(pd.Series, s_series)
+        return s_series


 class _SubheaderPointer:
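
A sketch of the refactored fallback: null SAS values now short-circuit to ``NaT`` instead of raising inside the old lambda (GH#39725); the helper is private, assumed importable as below:

import numpy as np
from pandas.io.sas.sas7bdat import _parse_datetime

# NaN marks a missing SAS datetime and now maps to NaT.
print(_parse_datetime(np.nan, unit="d"))  # NaT
# Non-null values are offsets from the SAS epoch, 1960-01-01.
print(_parse_datetime(1.0, unit="d"))     # 1960-01-02 00:00:00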
pandas/tests/io/sas/data/dates_null.sas7bdat (path inferred from test_null_date below)

128 KB, binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py

Lines changed: 19 additions & 0 deletions
@@ -315,3 +315,22 @@ def test_max_sas_date_iterator(datapath):
     ]
     for result, expected in zip(results, expected):
         tm.assert_frame_equal(result, expected)
+
+
+def test_null_date(datapath):
+    fname = datapath("io", "sas", "data", "dates_null.sas7bdat")
+    df = pd.read_sas(fname, encoding="utf-8")
+
+    expected = pd.DataFrame(
+        {
+            "datecol": [
+                datetime(9999, 12, 29),
+                pd.NaT,
+            ],
+            "datetimecol": [
+                datetime(9999, 12, 29, 23, 59, 59, 998993),
+                pd.NaT,
+            ],
+        },
+    )
+    tm.assert_frame_equal(df, expected)
