Skip to content

Commit c59dca0

Browse files
committed
Merge remote-tracking branch 'upstream/master' into pandas-array-interface-3
2 parents 7110b2a + fb3b237 commit c59dca0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+1067
-1755
lines changed

asv_bench/benchmarks/replace.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,15 @@ class Convert(object):
4444

4545
goal_time = 0.5
4646
params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
47-
param_names = ['contructor', 'replace_data']
47+
param_names = ['constructor', 'replace_data']
4848

49-
def setup(self, contructor, replace_data):
49+
def setup(self, constructor, replace_data):
5050
N = 10**3
5151
data = {'Series': pd.Series(np.random.randint(N, size=N)),
5252
'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
5353
'B': np.random.randint(N, size=N)})}
5454
self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
55-
self.data = data[contructor]
55+
self.data = data[constructor]
5656

57-
def time_replace(self, contructor, replace_data):
57+
def time_replace(self, constructor, replace_data):
5858
self.data.replace(self.to_replace)

asv_bench/benchmarks/rolling.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@ class Methods(object):
1212
['int', 'float'],
1313
['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
1414
'sum', 'corr', 'cov'])
15-
param_names = ['contructor', 'window', 'dtype', 'method']
15+
param_names = ['constructor', 'window', 'dtype', 'method']
1616

17-
def setup(self, contructor, window, dtype, method):
17+
def setup(self, constructor, window, dtype, method):
1818
N = 10**5
1919
arr = np.random.random(N).astype(dtype)
20-
self.roll = getattr(pd, contructor)(arr).rolling(window)
20+
self.roll = getattr(pd, constructor)(arr).rolling(window)
2121

22-
def time_rolling(self, contructor, window, dtype, method):
22+
def time_rolling(self, constructor, window, dtype, method):
2323
getattr(self.roll, method)()
2424

2525

@@ -30,12 +30,12 @@ class Quantile(object):
3030
[10, 1000],
3131
['int', 'float'],
3232
[0, 0.5, 1])
33-
param_names = ['contructor', 'window', 'dtype', 'percentile']
33+
param_names = ['constructor', 'window', 'dtype', 'percentile']
3434

35-
def setup(self, contructor, window, dtype, percentile):
35+
def setup(self, constructor, window, dtype, percentile):
3636
N = 10**5
3737
arr = np.random.random(N).astype(dtype)
38-
self.roll = getattr(pd, contructor)(arr).rolling(window)
38+
self.roll = getattr(pd, constructor)(arr).rolling(window)
3939

40-
def time_quantile(self, contructor, window, dtype, percentile):
40+
def time_quantile(self, constructor, window, dtype, percentile):
4141
self.roll.quantile(percentile)

ci/requirements-3.6.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ lxml
1313
html5lib
1414
jinja2
1515
sqlalchemy
16-
pymysql
16+
pymysql<0.8.0
1717
feather-format
1818
pyarrow
1919
psycopg2

doc/source/10min.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ Selection
154154
While standard Python / Numpy expressions for selecting and setting are
155155
intuitive and come in handy for interactive work, for production code, we
156156
recommend the optimized pandas data access methods, ``.at``, ``.iat``,
157-
``.loc``, ``.iloc`` and ``.ix``.
157+
``.loc`` and ``.iloc``.
158158

159159
See the indexing documentation :ref:`Indexing and Selecting Data <indexing>` and :ref:`MultiIndex / Advanced Indexing <advanced>`.
160160

doc/source/api.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2500,7 +2500,7 @@ Scalar introspection
25002500
Extensions
25012501
----------
25022502

2503-
These are primarily intented for library authors looking to extend pandas
2503+
These are primarily intended for library authors looking to extend pandas
25042504
objects.
25052505

25062506
.. currentmodule:: pandas

doc/source/io.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2675,7 +2675,7 @@ file, and the ``sheet_name`` indicating which sheet to parse.
26752675
+++++++++++++++++++
26762676

26772677
To facilitate working with multiple sheets from the same file, the ``ExcelFile``
2678-
class can be used to wrap the file and can be be passed into ``read_excel``
2678+
class can be used to wrap the file and can be passed into ``read_excel``.
26792679
There will be a performance benefit for reading multiple sheets as the file is
26802680
read into memory only once.
26812681

@@ -4537,7 +4537,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
45374537
.. note::
45384538

45394539
These engines are very similar and should read/write nearly identical parquet format files.
4540-
Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
4540+
Currently ``pyarrow`` does not support timedelta data, while ``fastparquet>=0.1.4`` supports timezone aware datetimes.
45414541
These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library).
45424542

45434543
.. ipython:: python

doc/source/whatsnew/v0.23.0.txt

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ Deprecations
329329
- :func:`read_excel` has deprecated the ``skip_footer`` parameter. Use ``skipfooter`` instead (:issue:`18836`)
330330
- The ``is_copy`` attribute is deprecated and will be removed in a future version (:issue:`18801`).
331331
- ``IntervalIndex.from_intervals`` is deprecated in favor of the :class:`IntervalIndex` constructor (:issue:`19263`)
332-
332+
- :func:`DataFrame.from_items` is deprecated. Use :func:`DataFrame.from_dict` instead, or ``DataFrame.from_dict(OrderedDict())`` if you wish to preserve the key order (:issue:`17320`)
333333

334334
.. _whatsnew_0230.prior_deprecations:
335335

@@ -380,6 +380,7 @@ Performance Improvements
380380
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
381381
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
382382
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
383+
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
383384

384385

385386
.. _whatsnew_0230.docs:
@@ -419,6 +420,7 @@ Datetimelike
419420
- Bug in ``.astype()`` to non-ns timedelta units would hold the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`)
420421
- Bug in subtracting :class:`Series` from ``NaT`` incorrectly returning ``NaT`` (:issue:`19158`)
421422
- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
423+
- Bug in :func:`~DataFrame.pct_change` using ``periods`` and ``freq`` returned different length outputs (:issue:`7292`)
422424

423425
Timezones
424426
^^^^^^^^^
@@ -431,6 +433,7 @@ Timezones
431433
- :func:`Timestamp.replace` will now handle Daylight Savings transitions gracefully (:issue:`18319`)
432434
- Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`)
433435
- Bug in :func:`DatetimeIndex.insert` where inserting ``NaT`` into a timezone-aware index incorrectly raised (:issue:`16357`)
436+
- Bug in the :class:`DataFrame` constructor, where a tz-aware ``DatetimeIndex`` and a given column name will result in an empty ``DataFrame`` (:issue:`19157`)
434437

435438
Offsets
436439
^^^^^^^
@@ -476,7 +479,11 @@ MultiIndex
476479
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
477480
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
478481
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`)
479-
-
482+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
483+
- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
484+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
485+
- Bug in :func:`MultiIndex.get_loc` in large :class:`MultiIndex`, which would fail when levels had different dtypes (:issue:`18520`)
486+
480487

481488
I/O
482489
^^^
@@ -489,6 +496,8 @@ I/O
489496
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
490497
- Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
491498
- Bug in :func:`DataFrame.to_parquet` where an exception was raised if the write destination is S3 (:issue:`19134`)
499+
- :class:`Interval` now supported in :func:`DataFrame.to_excel` for all Excel file types (:issue:`19242`)
500+
- :class:`Timedelta` now supported in :func:`DataFrame.to_excel` for xls file type (:issue:`19242`, :issue:`9155`)
492501
-
493502

494503
Plotting

doc/sphinxext/numpydoc/tests/test_docscrape.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
-------
4343
out : ndarray
4444
The drawn samples, arranged according to `shape`. If the
45-
shape given is (m,n,...), then the shape of `out` is is
45+
shape given is (m,n,...), then the shape of `out` is
4646
(m,n,...,N).
4747
4848
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -222,7 +222,7 @@ def test_str():
222222
-------
223223
out : ndarray
224224
The drawn samples, arranged according to `shape`. If the
225-
shape given is (m,n,...), then the shape of `out` is is
225+
shape given is (m,n,...), then the shape of `out` is
226226
(m,n,...,N).
227227
228228
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
@@ -340,7 +340,7 @@ def test_sphinx_str():
340340
**out** : ndarray
341341
342342
The drawn samples, arranged according to `shape`. If the
343-
shape given is (m,n,...), then the shape of `out` is is
343+
shape given is (m,n,...), then the shape of `out` is
344344
(m,n,...,N).
345345
346346
In other words, each entry ``out[i,j,...,:]`` is an N-dimensional

pandas/_libs/hashtable.pxd

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
3131
cpdef get_item(self, object val)
3232
cpdef set_item(self, object key, Py_ssize_t val)
3333

34-
cdef class MultiIndexHashTable(HashTable):
35-
cdef:
36-
kh_uint64_t *table
37-
object mi
38-
39-
cpdef get_item(self, object val)
40-
cpdef set_item(self, object key, Py_ssize_t val)
41-
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
42-
4334

4435
cdef class StringHashTable(HashTable):
4536
cdef kh_str_t *table

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 0 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
899899
count += 1
900900

901901
return np.asarray(labels)
902-
903-
904-
cdef class MultiIndexHashTable(HashTable):
905-
906-
def __init__(self, size_hint=1):
907-
self.table = kh_init_uint64()
908-
self.mi = None
909-
kh_resize_uint64(self.table, size_hint)
910-
911-
def __dealloc__(self):
912-
if self.table is not NULL:
913-
kh_destroy_uint64(self.table)
914-
self.table = NULL
915-
916-
def __len__(self):
917-
return self.table.size
918-
919-
def sizeof(self, deep=False):
920-
""" return the size of my table in bytes """
921-
return self.table.n_buckets * (sizeof(uint64_t) + # keys
922-
sizeof(size_t) + # vals
923-
sizeof(uint32_t)) # flags
924-
925-
def _check_for_collisions(self, int64_t[:] locs, object mi):
926-
# validate that the locs map to the actual values
927-
# provided in the mi
928-
# we can only check if we *don't* have any missing values
929-
# :<
930-
cdef:
931-
ndarray[int64_t] alocs
932-
933-
alocs = np.asarray(locs)
934-
if (alocs != -1).all():
935-
936-
result = self.mi.take(locs)
937-
if isinstance(mi, tuple):
938-
from pandas import Index
939-
mi = Index([mi])
940-
if not result.equals(mi):
941-
raise AssertionError(
942-
"hash collision\nlocs:\n{}\n"
943-
"result:\n{}\nmi:\n{}".format(alocs, result, mi))
944-
945-
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
946-
# validate that the loc maps to the actual value
947-
# version of _check_for_collisions above for single label (tuple)
948-
949-
result = self.mi[loc]
950-
951-
if not all(l == r or (is_null_datetimelike(l)
952-
and is_null_datetimelike(r))
953-
for l, r in zip(result, label)):
954-
raise AssertionError(
955-
"hash collision\nloc:\n{}\n"
956-
"result:\n{}\nmi:\n{}".format(loc, result, label))
957-
958-
def __contains__(self, object key):
959-
try:
960-
self.get_item(key)
961-
return True
962-
except (KeyError, ValueError, TypeError):
963-
return False
964-
965-
cpdef get_item(self, object key):
966-
cdef:
967-
khiter_t k
968-
uint64_t value
969-
int64_t[:] locs
970-
Py_ssize_t loc
971-
972-
value = self.mi._hashed_indexing_key(key)
973-
k = kh_get_uint64(self.table, value)
974-
if k != self.table.n_buckets:
975-
loc = self.table.vals[k]
976-
self._check_for_collision(loc, key)
977-
return loc
978-
else:
979-
raise KeyError(key)
980-
981-
cpdef set_item(self, object key, Py_ssize_t val):
982-
raise NotImplementedError
983-
984-
@cython.boundscheck(False)
985-
def map_locations(self, object mi):
986-
cdef:
987-
Py_ssize_t i, n
988-
ndarray[uint64_t] values
989-
uint64_t val
990-
int ret = 0
991-
khiter_t k
992-
993-
self.mi = mi
994-
n = len(mi)
995-
values = mi._hashed_values
996-
997-
with nogil:
998-
for i in range(n):
999-
val = values[i]
1000-
k = kh_put_uint64(self.table, val, &ret)
1001-
self.table.vals[k] = i
1002-
1003-
@cython.boundscheck(False)
1004-
def lookup(self, object mi):
1005-
# look up with a target mi
1006-
cdef:
1007-
Py_ssize_t i, n
1008-
ndarray[uint64_t] values
1009-
int ret = 0
1010-
uint64_t val
1011-
khiter_t k
1012-
int64_t[:] locs
1013-
1014-
n = len(mi)
1015-
values = mi._hashed_values
1016-
1017-
locs = np.empty(n, dtype=np.int64)
1018-
1019-
with nogil:
1020-
for i in range(n):
1021-
val = values[i]
1022-
k = kh_get_uint64(self.table, val)
1023-
if k != self.table.n_buckets:
1024-
locs[i] = self.table.vals[k]
1025-
else:
1026-
locs[i] = -1
1027-
1028-
self._check_for_collisions(locs, mi)
1029-
return np.asarray(locs)
1030-
1031-
def unique(self, object mi):
1032-
raise NotImplementedError
1033-
1034-
def get_labels(self, object mi, ObjectVector uniques,
1035-
Py_ssize_t count_prior, int64_t na_sentinel,
1036-
bint check_null=True):
1037-
raise NotImplementedError

0 commit comments

Comments
 (0)