// Patch summary for pandas/_libs/new_vector.cpp (these comment lines sit ahead of the
// `diff --git` header so that no hunk content or hunk line count is altered):
//   * Deletes the MaybeResizeKlibContainer helper and every call to it that followed an
//     insertion into hash_map_ / hash_set_.
//   * PandasVector: adds INIT_VEC_CAP (128); both constructors now call
//     vec_.reserve(INIT_VEC_CAP); a new constructor accepts a std::vector by rvalue reference.
//   * PandasHashTable(HashValueT): the __APPLE__ branch casts new_size once into `ns` and
//     passes `ns` to both resize calls.
//   * Unique-with-mask hunk: per-unique result.Append(0/1) calls are removed; the loop records
//     seen_na / na_pos instead, and after the loops a zero-filled vector of
//     hash_set_.n_buckets() entries is built, entry na_pos is set to 1 when seen_na is true,
//     and the vector is moved into the returned PandasVector.
//
// Review fix applied in this revision of the patch:
//   * The rvalue-reference constructor initialised vec_ with `vec_(vec)`. Inside the
//     constructor `vec` is a named variable and therefore an lvalue, so that form selects the
//     copy constructor. It now reads `vec_(std::move(vec))` so the buffer is actually moved.
//
// NOTE(review): na_pos is assigned the loop index `i` over the input values, while `tmp` has
//   hash_set_.n_buckets() entries. Confirm that na_pos is always < n_buckets(), and that the
//   returned mask is meant to be indexed by input position; the removed code appended exactly
//   one flag per appended unique.
// NOTE(review): `missing_vec` is declared but not referenced in the visible hunk lines --
//   confirm whether it is still needed.
// NOTE(review): dropping MaybeResizeKlibContainer presumably relies on the hash containers
//   growing on insert by themselves -- verify against the khash wrapper.
// NOTE: template argument lists appear to have been stripped from this extract (for example
//   `static_cast(new_size)` and `std::vector&&`); that text is kept exactly as found.
diff --git a/pandas/_libs/new_vector.cpp b/pandas/_libs/new_vector.cpp index 29f06aae82106..26a7e3fb5a70e 100644 --- a/pandas/_libs/new_vector.cpp +++ b/pandas/_libs/new_vector.cpp @@ -76,16 +76,16 @@ template auto PandasIsNA(bool mask_value, T &scalar_value) { } } -template auto MaybeResizeKlibContainer(T &container) { - const auto current_size = container.size(); - if (container.n_buckets() == current_size) { - container.resize(current_size * 4); - } -} - template class PandasVector { public: - explicit PandasVector() : external_view_exists_(false) {} + static constexpr size_t INIT_VEC_CAP = 128; + + explicit PandasVector() : external_view_exists_(false) { + vec_.reserve(INIT_VEC_CAP); + } + explicit PandasVector(std::vector&& vec) : vec_(std::move(vec)), external_view_exists_(false) { + vec_.reserve(INIT_VEC_CAP); + } ~PandasVector() = default; PandasVector(PandasVector const &) = delete; void operator=(PandasVector const &) = delete; @@ -137,8 +137,9 @@ template class PandasHashTable { explicit PandasHashTable(HashValueT new_size) { #if __APPLE__ // macOS cannot resolve size_t to uint32_t or uint64_t that khash needs - hash_map_.resize(static_cast(new_size)); - hash_set_.resize(static_cast(new_size)); + const auto ns = static_cast(new_size); + hash_map_.resize(ns); + hash_set_.resize(ns); #else hash_map_.resize(new_size); hash_set_.resize(new_size); @@ -226,7 +227,6 @@ template class PandasHashTable { const auto n = values_v.shape(0); for (auto i = decltype(n){0}; i < n; i++) { hash_map_[keys_v(i)] = values_v(i); - MaybeResizeKlibContainer(hash_map_); } } @@ -251,7 +251,6 @@ template class PandasHashTable { na_position = i; } else { hash_map_[values_v(i)] = i; - MaybeResizeKlibContainer(hash_map_); } } na_position_ = na_position; @@ -259,7 +258,6 @@ template class PandasHashTable { for (auto i = decltype(n){0}; i < n; i++) { const auto key = values_v(i); hash_map_[key] = i; - MaybeResizeKlibContainer(hash_map_); } } } @@ -428,7 +426,6 @@ template class PandasHashTable { 
int dummy; k = hash_map_.put(val, &dummy); hash_map_.value(k) = count; - MaybeResizeKlibContainer(hash_map_); uniques.Append(val); labels[i] = count; count++; @@ -487,7 +484,6 @@ template class PandasHashTable { k = hash_map_.put(val, &dummy); uniques.Append(val); hash_map_.value(k) = count_prior; - MaybeResizeKlibContainer(hash_map_); labels[i] = count_prior; count_prior++; } else { @@ -521,7 +517,6 @@ template class PandasHashTable { k = hash_map_.put(val, &dummy); uniques.Append(val); hash_map_.value(k) = count_prior; - MaybeResizeKlibContainer(hash_map_); labels[i] = count_prior; count_prior++; } else { @@ -550,8 +545,10 @@ template class PandasHashTable { const auto values_v = values.view(); const auto n = values.shape(0); - PandasVector result; + bool seen_na = false; + auto na_pos = decltype(n){0}; + std::vector missing_vec; if constexpr (IsMasked) { using MaskT = nb::ndarray>; MaskT mask; @@ -560,15 +557,13 @@ template class PandasHashTable { } nb::call_guard(); const auto mask_v = mask.view(); - - bool seen_na = false; for (auto i = decltype(n){0}; i < n; i++) { const auto val = values_v(i); if (PandasIsNA(mask_v(i), val)) { if (!seen_na) { uniques.Append(val); - result.Append(1); + na_pos = i; seen_na = true; } continue; @@ -576,27 +571,31 @@ template class PandasHashTable { int absent; hash_set_.put(val, &absent); - MaybeResizeKlibContainer(hash_set_); if (absent) { uniques.Append(val); - result.Append(0); } } } else { + // TODO: why do we even have this branch? 
nb::call_guard(); for (auto i = decltype(n){0}; i < n; i++) { const auto val = values_v(i); int absent; hash_set_.put(val, &absent); - MaybeResizeKlibContainer(hash_set_); if (absent) { uniques.Append(val); - result.Append(0); } } } - return result; + + std::vector tmp; + tmp.resize(hash_set_.n_buckets(), 0); + if (seen_na) { + tmp[na_pos] = 1; + } + + return PandasVector(std::move(tmp)); } auto UniquesOnly(const nb::ndarray> &values, @@ -612,7 +611,6 @@ template class PandasHashTable { if (k == hash_map_.end()) { int dummy; k = hash_map_.put(val, &dummy); - MaybeResizeKlibContainer(hash_map_); uniques.Append(val); } }