Skip to content

Commit

Permalink
PERF: concatenation of MultiIndexed objects (MultiIndex.append) (pand…
Browse files Browse the repository at this point in the history
…as-dev#53697)

* improve perf of MultiIndex.append

* fix test

* style
  • Loading branch information
lukemanley authored Jun 20, 2023
1 parent b149e17 commit f989e1b
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 11 deletions.
26 changes: 26 additions & 0 deletions asv_bench/benchmarks/multiindex_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,4 +396,30 @@ def time_putmask_all_different(self):
self.midx.putmask(self.mask, self.midx_values_different)


class Append:
    """ASV benchmark that times ``MultiIndex.append`` on two-level indexes.

    The benchmark is parametrized over the dtype of the second level.
    """

    params = ["datetime64[ns]", "int64", "string"]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Build the ``left`` and ``right`` MultiIndexes used by the benchmark.

        Both indexes are cartesian products of an integer range (first level)
        with a shared second level whose type is chosen by ``dtype``. The two
        first-level ranges do not overlap.

        Raises
        ------
        NotImplementedError
            If ``dtype`` is not one of the supported parameter values.
        """
        outer_size = 1000
        inner_size = 500

        # Pick the shared second level first so an unsupported dtype fails
        # before any index is constructed.
        if dtype == "string":
            inner = tm.makeStringIndex(inner_size)
        elif dtype == "int64":
            inner = range(inner_size)
        elif dtype == "datetime64[ns]":
            inner = date_range(start="2000-01-01", periods=inner_size)
        else:
            raise NotImplementedError

        outer_left = range(outer_size)
        outer_right = range(outer_size, outer_size + outer_size)

        self.left = MultiIndex.from_product([outer_left, inner])
        self.right = MultiIndex.from_product([outer_right, inner])

    def time_append(self, dtype):
        """Time appending ``right`` onto ``left``; the result is discarded."""
        self.left.append(self.right)


from .pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ Performance improvements
- Performance improvement accessing :attr:`arrays.IntegerArray.dtype` and :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
Expand Down
33 changes: 25 additions & 8 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@
Categorical,
ExtensionArray,
)
from pandas.core.arrays.categorical import factorize_from_iterables
from pandas.core.arrays.categorical import (
factorize_from_iterables,
recode_for_categories,
)
import pandas.core.common as com
from pandas.core.construction import sanitize_array
import pandas.core.indexes.base as ibase
Expand Down Expand Up @@ -2145,14 +2148,28 @@ def append(self, other):
if all(
(isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
):
arrays, names = [], []
codes = []
levels = []
names = []
for i in range(self.nlevels):
label = self._get_level_values(i)
appended = [o._get_level_values(i) for o in other]
arrays.append(label.append(appended))
single_label_name = all(label.name == x.name for x in appended)
names.append(label.name if single_label_name else None)
return MultiIndex.from_arrays(arrays, names=names)
level_values = self.levels[i]
for mi in other:
level_values = level_values.union(mi.levels[i])
level_codes = [
recode_for_categories(
mi.codes[i], mi.levels[i], level_values, copy=False
)
for mi in ([self, *other])
]
level_name = self.names[i]
if any(mi.names[i] != level_name for mi in other):
level_name = None
codes.append(np.concatenate(level_codes))
levels.append(level_values)
names.append(level_name)
return MultiIndex(
codes=codes, levels=levels, names=names, verify_integrity=False
)

to_concat = (self._values,) + tuple(k._values for k in other)
new_tuples = np.concatenate(to_concat)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def test_combine_first_duplicates_rows_for_nan_index_values():
"y": [12.0, 13.0, np.nan, 14.0],
},
index=MultiIndex.from_arrays(
[[1, 2, 3, 4], [np.nan, 5.0, 6.0, 7.0]], names=["a", "b"]
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
),
)
combined = df1.combine_first(df2)
Expand Down
2 changes: 0 additions & 2 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,8 +558,6 @@ def test_union_with_missing_values_on_both_sides(nulls_fixture):
mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]])
result = mi1.union(mi2)
expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]])
# We don't particularly care about having levels[0] be float64, but it is
expected = expected.set_levels([expected.levels[0].astype(np.float64)])
tm.assert_index_equal(result, expected)


Expand Down

0 comments on commit f989e1b

Please sign in to comment.